
import os
import tarfile
+ import hashlib

# Dependency imports


_DAILYMAIL_STORIES_DRIVE_URL = "https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs"

+ # Note: the data generation follows See et al. (2017).
+ # For more info, see the links below.
+
+ # Train/Dev/Test splits for the summarization data.
+ _TRAIN_URLS = "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt"
+ _DEV_URLS = "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt"
+ _TEST_URLS = "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt"

# End-of-sentence marker.
EOS = text_encoder.EOS_ID

+ # Techniques for data prep from See et al. (2017).
+ dm_single_close_quote = u'\u2019'  # unicode
+ dm_double_close_quote = u'\u201d'
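+ # \u2019 and \u201d are the Unicode right single and right double quotation
+ # marks; they count as valid sentence endings below.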
+ # Acceptable ways to end a sentence.
+ END_TOKENS = [u'.', u'!', u'?', u'...', u"'", u"`", u'"',
+               dm_single_close_quote, dm_double_close_quote, u")"]
+

- def _maybe_download_corpora(tmp_dir):
+ def _maybe_download_corpora(tmp_dir, is_training):
  """Download corpora if necessary and unzip them.

  Args:
    tmp_dir: directory containing dataset.
+     is_training: whether we are generating the training split or the dev split.

  Returns:
-     filepath of the downloaded corpus file.
+     List of all story files found, and the path to the file containing the
+       train/dev/test split info.
  """
  cnn_filename = "cnn_stories.tgz"
  cnn_finalpath = os.path.join(tmp_dir, "cnn/stories/")
@@ -66,29 +79,87 @@ def _maybe_download_corpora(tmp_dir):
      tmp_dir, dailymail_filename, _DAILYMAIL_STORIES_DRIVE_URL)
  with tarfile.open(dailymail_file, "r:gz") as dailymail_tar:
    dailymail_tar.extractall(tmp_dir)
-   return [cnn_finalpath, dailymail_finalpath]
-
-
- def story_generator(tmp_dir):
-   paths = _maybe_download_corpora(tmp_dir)
-   for path in paths:
-     for story_file in tf.gfile.Glob(path + "*"):
-       story = u""
-       for line in tf.gfile.Open(story_file, "rb"):
-         line = unicode(line, "utf-8") if six.PY2 else line.decode("utf-8")
-         story += line
-       yield story
+
+   cnn_files = tf.gfile.Glob(cnn_finalpath + "*")
+   dailymail_files = tf.gfile.Glob(dailymail_finalpath + "*")
+   all_files = cnn_files + dailymail_files
+
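+   # The URL lists from See et al.'s cnn-dailymail repo define which stories
+   # belong to the train and dev splits.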
+   if is_training:
+     urls_path = generator_utils.maybe_download(
+         tmp_dir, "all_train.txt", _TRAIN_URLS)
+   else:
+     urls_path = generator_utils.maybe_download(
+         tmp_dir, "all_val.txt", _DEV_URLS)
+
+   return all_files, urls_path
+
94
+ def example_splits (url_file , all_files ):
95
+ def generate_hash (inp ):
96
+ """Generate a sha1 hash to match the raw url to the filename extracted"""
97
+ h = hashlib .sha1 ()
98
+ h .update (inp )
99
+ return h .hexdigest ()
100
+
101
+ all_files_map = {f .split ("/" )[- 1 ]:f for f in all_files }
102
+
103
+ urls = []
104
+ for line in tf .gfile .Open (url_file ):
105
+ urls .append (line .strip ().encode ('utf-8' ))
106
+
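+   # Each story file extracted from the archives is named by the SHA1 hex
+   # digest of its source URL, so hashing each URL recovers its filename.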
+   filelist = []
+   for url in urls:
+     url_hash = generate_hash(url)
+     filename = url_hash + ".story"
+     if filename not in all_files_map:
+       tf.logging.info("Missing file: %s" % url)
+       continue
+     filelist.append(all_files_map[filename])
+
+   tf.logging.info("Found %d examples" % len(filelist))
+
+   return filelist
+
+ def example_generator(tmp_dir, is_training, sum_token):
+   def fix_run_on_sents(line):
+     if u"@highlight" in line: return line
+     if line == "": return line
+     if line[-1] in END_TOKENS: return line
+     return line + u"."
+
+   all_files, urls_path = _maybe_download_corpora(tmp_dir, is_training)
+   filelist = example_splits(urls_path, all_files)
+   story_summary_split_token = u" <summary> " if sum_token else " "
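+   # With sum_token=True the article and its highlights are joined by a
+   # " <summary> " marker, which _story_summary_split() later uses to
+   # separate them again.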
+
+   for story_file in filelist:
+     story = []
+     summary = []
+     reading_highlights = False
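+     # Lines before the first "@highlight" marker are article text; each line
+     # after an "@highlight" is collected as part of the summary.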
+     for line in tf.gfile.Open(story_file, "rb"):
+       line = unicode(line.strip(), "utf-8") if six.PY2 else line.strip().decode("utf-8")
+       line = fix_run_on_sents(line)
+       if line == "":
+         continue
+       elif line.startswith(u"@highlight"):
+         if len(story) == 0: break  # No article text.
+         reading_highlights = True
+       elif reading_highlights:
+         summary.append(line)
+       else:
+         story.append(line)
+
+     if len(story) == 0 or len(summary) == 0:
+       continue
+
+     yield " ".join(story) + story_summary_split_token + " ".join(summary)

def _story_summary_split(story):
-   end_pos = story.find("\n\n")  # Up to the first empty line.
-   assert end_pos != -1
-   return story[:end_pos], story[end_pos:].strip()
+   split_str = u" <summary> "
+   split_str_len = len(split_str)
+   split_pos = story.find(split_str)
+   return story[:split_pos], story[split_pos + split_str_len:]  # story, summary


@registry.register_problem
class SummarizeCnnDailymail32k(problem.Text2TextProblem):
-   """Summarize CNN and Daily Mail articles to their first paragraph."""
+   """Summarize CNN and Daily Mail articles to their summary highlights."""

  @property
  def is_character_level(self):
@@ -124,14 +195,14 @@ def targeted_vocab_size(self):

  @property
  def use_train_shards_for_dev(self):
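+     # Dev data now comes from its own URL split (all_val.txt) rather than
+     # reusing the training shards.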
-     return True
+     return False

-   def generator(self, data_dir, tmp_dir, _):
+   def generator(self, data_dir, tmp_dir, is_training):
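+     # The vocab is built from text generated without the " <summary> " marker
+     # (sum_token=False); examples are then generated with it (sum_token=True)
+     # only so that _story_summary_split() can split story from summary below.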
    encoder = generator_utils.get_or_generate_vocab_inner(
        data_dir, self.vocab_file, self.targeted_vocab_size,
-         story_generator(tmp_dir))
-     for story in story_generator(tmp_dir):
-       summary, rest = _story_summary_split(story)
+         example_generator(tmp_dir, is_training, sum_token=False))
+     for example in example_generator(tmp_dir, is_training, sum_token=True):
+       story, summary = _story_summary_split(example)
      encoded_summary = encoder.encode(summary) + [EOS]
-       encoded_story = encoder.encode(rest) + [EOS]
+       encoded_story = encoder.encode(story) + [EOS]
      yield {"inputs": encoded_story, "targets": encoded_summary}