Changes from all commits (44 commits)
- deb9f30  Update train_run.py (heychhavi, Nov 14, 2023)
- 57598ff  Update run_clm.py (heychhavi, Nov 15, 2023)
- a4605be  Update run_clm.py (heychhavi, Nov 16, 2023)
- 515f88b  Add files via upload (heychhavi, Dec 21, 2023)
- d11451b  Update run_train.py (heychhavi, Apr 5, 2024)
- a4b7afd  Update train.py (heychhavi, Apr 6, 2024)
- a462507  Update train.py (heychhavi, Apr 6, 2024)
- 177c1a1  Update train.py (heychhavi, Apr 6, 2024)
- d51187b  Update text_datasets.py (heychhavi, May 4, 2024)
- 88fef11  Update text_datasets.py (heychhavi, May 4, 2024)
- f0f9bd4  Update text_datasets.py (heychhavi, May 4, 2024)
- 06c45aa  Update run_clm.py (heychhavi, May 4, 2024)
- 9f1adb1  Update run_clm.py (heychhavi, May 4, 2024)
- 9db7d63  Update run_clm.py (heychhavi, May 4, 2024)
- 5639581  Update infill.py (heychhavi, May 4, 2024)
- 389c54b  Update infill.py (heychhavi, May 4, 2024)
- 3f34e1b  Update infill.py (heychhavi, May 4, 2024)
- 1faac94  Update infill.py (heychhavi, May 4, 2024)
- 0ad85cb  Update infill.py (heychhavi, May 4, 2024)
- 8425548  Update run_clm.py (heychhavi, May 4, 2024)
- 982bbeb  Update infill.py (heychhavi, May 5, 2024)
- 579f5ea  Add files via upload (heychhavi, Jun 9, 2024)
- f4ade33  Update roc_train.json (heychhavi, Jun 9, 2024)
- fbfc2d3  Update roc_valid.json (heychhavi, Jun 9, 2024)
- 9ba7972  Add files via upload (heychhavi, Jun 9, 2024)
- 3a4f683  Update roc_train.json (heychhavi, Jun 9, 2024)
- dce1105  Update text_datasets.py (heychhavi, Jun 10, 2024)
- cd2c4e8  Update text_datasets.py (heychhavi, Jun 11, 2024)
- 7f636ed  Update text_datasets.py (heychhavi, Jun 11, 2024)
- 9f03970  Update text_datasets.py (heychhavi, Jun 11, 2024)
- 60eb3e6  Update text_datasets.py (heychhavi, Jun 11, 2024)
- e2504b9  Add files via upload (heychhavi, Jun 14, 2024)
- 3bf691f  Delete datasets/ROCstory/roc_train.json (heychhavi, Jun 14, 2024)
- 493dcb4  Rename roc_train (1).json to roc_train.json (heychhavi, Jun 14, 2024)
- 37c4430  Delete datasets/ROCstory/roc_train.json (heychhavi, Jun 14, 2024)
- 1cdc71f  Add files via upload (heychhavi, Jun 14, 2024)
- 216225a  Update run_clm.py (heychhavi, Jul 2, 2024)
- 11dfacd  Update infill.py (heychhavi, Jul 23, 2024)
- 4300f25  Update run_clm.py (heychhavi, Jul 24, 2024)
- 1b15726  Update run_clm.py (heychhavi, Jul 24, 2024)
- 2257c64  Update run_clm.py (heychhavi, Jul 24, 2024)
- c5e8be9  Update run_clm.py (heychhavi, Jul 24, 2024)
- 9bd248c  Update infill.py (heychhavi, Aug 7, 2024)
- 05383ed  Update infill.py (heychhavi, Aug 8, 2024)
5,029 changes: 29 additions & 5,000 deletions datasets/ROCstory/roc_valid.json

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions datasets/aigeneratedtext.txt
@@ -0,0 +1,27 @@
"

Playing for the Michigan State basketball team in the aftermath of a horrific shooting meant that the team had to take on the responsibility of representing the university in a difficult time. It required them to open themselves up to criticism and public scrutiny, but they felt it was important to show resilience and strength in the face of adversity. They wanted to show that, in spite of the tragedy, life goes on."
"
In 1985, a small plane carrying a load of cocaine crashed in the Chattahoochee National Forest in Georgia. The crash site was discovered by a US Forest Service worker, who found a bear munching on the drugs. The bear had gotten into the drug shipment - which was estimated to be worth over $15 million - and had eaten around 20 pounds of cocaine. The bear was dubbed ""Cocaine Bear,"" and the story quickly became a source of folk legend.

The incident gained even more traction when news outlets started to report on it, and in 2009, the story was adapted into a mockumentary called “Cocaine Bear: The Legend of Elrod,” which was released on YouTube.

Now, the story of the Cocaine Bear is getting the big-screen treatment. The film, ""Cocaine Bear,"" follows the story of a young bear cub named Elrod who is raised by drug smugglers in the 1980s. After the plane carrying a shipment of cocaine crashes in the national forest, Elrod is forced to fend for himself and learns to survive in the wild. The film stars Andrew Lincoln, Jessica Chastain, and Woody Harrelson.

The film is a dark comedy and a"
"

In 1985, a large black bear was discovered in the Chattahoochee National Forest in Georgia with a strange story. The bear had been spotted by hunters, and it was waddling around with a dead man’s body in its mouth. After further investigation, it was discovered that the man had died from a drug overdose and that the bear had been scavenging the body for food. What was even more bizarre was that the bear had managed to open the man’s backpack and consume the contents, which included over 75 pounds of cocaine that had been packed in plastic bags.

The bear was found to have a level of cocaine in its bloodstream that was seven times higher than the lethal dose for humans. It was eventually euthanized due to its extreme agitation and aggression. The animal’s strange tale has since become a popular urban legend, and experts have warned against feeding wild animals, as it can pose a significant risk to both the animals and humans."
"

To make it, start by heating some oil in a large skillet over medium-high heat. Add in a pound of ground pork and cook until no longer pink, breaking it up with a wooden spoon as it cooks. Add in some minced garlic and dried herbs such as oregano, thyme, and rosemary, and season with salt and pepper. Cook for a few minutes until the herbs are fragrant.

Next, add 1 cup of long-grain white rice and stir to combine. Pour in 2 cups of chicken broth and bring to a boil. Reduce the heat to low, cover the skillet, and simmer for about 18 minutes, or until the rice is tender.

Meanwhile, heat 2 tablespoons of butter in a separate skillet over medium heat. Add in a large head of thinly sliced cabbage and sauté for about 10 minutes, stirring occasionally, until golden and caramelized.

Once the pork and rice is done, remove it from the heat and fluff the rice with a fork. To assemble the skillet, layer the cabbage over the bottom of the skillet, followed by the pork and rice mixture. Top with the remaining cabbage and press it down into the mixture with a spoon.

Cover the skillet and cook over".
2 changes: 1 addition & 1 deletion improved-diffusion/improved_diffusion/text_datasets.py
@@ -945,4 +945,4 @@ def _torch_collate_batch(examples, pad_token_id, max_length):
result[i, : example.shape[0]] = example
else:
result[i, -example.shape[0] :] = example
-return result
+return result
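For context on the hunk above: `_torch_collate_batch` copies each variable-length example into a fixed-size padded batch, aligned left or right depending on the tokenizer's padding side. A minimal pure-Python sketch of that padding logic (the function name and list-based batch here are illustrative, not the repo's torch implementation):

```python
# Pad variable-length token-id sequences into a rectangular batch.
# pad_left=True mirrors the `else` branch above: examples are right-aligned,
# with padding inserted at the front.
def collate_batch(examples, pad_token_id, max_length, pad_left=False):
    result = [[pad_token_id] * max_length for _ in examples]
    for i, example in enumerate(examples):
        if pad_left:
            result[i][max_length - len(example):] = example
        else:
            result[i][:len(example)] = example
    return result

print(collate_batch([[5, 6], [7, 8, 9]], pad_token_id=0, max_length=4, pad_left=True))
```

The torch version does the same thing with `result[i, -example.shape[0]:] = example` on a pre-allocated tensor instead of nested lists.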
17 changes: 9 additions & 8 deletions improved-diffusion/scripts/infill.py
@@ -22,7 +22,8 @@
add_dict_to_argparser,
args_to_dict,
)
-sys.path.insert(0, 'diffusion_lm/transformers/examples/pytorch/language-modeling')
+#sys.path.insert(0, 'diffusion_lm/transformers/examples/pytorch/language-modeling')
+sys.path.insert(0, '/kaggle/working/Diffusion-LM/transformers/examples/pytorch/language-modeling')
from custom_trainer import Classifier_GPT2, Classifier_Times, Classifier_POS, Classifier_Tree
from infill_util import langevin_fn3, get_score, langevin_fn3_compose, langevin_fn1, langevin_fn4, langevin_fn_tree, langevin_fn_length
from spacy.lang.en import English
@@ -238,8 +239,7 @@ def main():
0.1)

elif args.eval_task_ == 'control_pos':
-model_control = Classifier_POS.from_pretrained('predictability/diff_models/e2e-tgt-pos_e=6_b=10_m=bert-'
-'base-uncased_wikitext-103-raw-v1_101_wp_full_multi16_v2').cuda()
+model_control = Classifier_POS.from_pretrained('/kaggle/working/Diffusion-LM/classifier_models/').cuda()


pos_vocab = {'START': 0, 'END': 1, 'UNK': 2, 'PAD': 3}
@@ -314,9 +314,10 @@ def main():
# 'predictability/diff_models/e2e-tgt-tree_e=20_b=32_m=bert-base-uncased_'
# 'wikitext-103-raw-v1_101_wp_full_multi16_v2').cuda()
model_control = Classifier_Tree.from_pretrained(
-'predictability/diff_models/e2e-tgt-tree_e=20_b=32_m=bert-base-uncased_'
-'wikitext-103-raw-v1_101_wp_full_multi16_cat').cuda()
-
+'/kaggle/working/Diffusion-LM/classifier_models/').cuda()
+#model_control = Classifier_Tree.from_pretrained(
+#    'predictability/diff_models/e2e-tgt-tree_e=20_b=32_m=bert-base-uncased_'
+#    'wikitext-103-raw-v1_101_wp_full_multi16_cat').cuda()
# print(model_control)

import benepar
@@ -714,7 +715,7 @@ def decode_helper(args, sample_dict, diff_model=None):
def create_argparser():
defaults = dict(
data_dir="", clip_denoised=False, use_ddim=False, eta=1.0, num_samples=50, batch_size=1, model_path="",
-out_dir="diffusion_lm/improved_diffusion/out_gen",
+out_dir="/kaggle/working/Diffusion-LM/improved-diffusion/improved_diffusion/",
emb_scale_factor=1.0, split='train', debug_path='', eval_task_='infill',
partial_seq="", partial_seq_file="", verbose='yes', tgt_len=15, t_merge=200, interp_coef=0.5, notes='',
start_idx=0, end_idx=0,
@@ -753,7 +754,7 @@ def eval(args):
if args.modality == 'e2e-tgt':
model_name_path = "predictability/diff_models/e2e-tgt_e=15_b=20_m=gpt2_wikitext-103-raw-v1_101_None"

-COMMAND = f"python scripts/ppl_under_ar.py " \
+COMMAND = f"python /kaggle/working/Diffusion-LM/improved-diffusion/scripts/ppl_under_ar.py " \
f"--model_path {args.model_path} " \
f"--modality {args.modality} --experiment random " \
f"--model_name_or_path {model_name_path} " \
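The recurring pattern in this PR is prepending a Kaggle working-directory path to `sys.path` so that modules from the repo checkout (like `custom_trainer`) resolve instead of relying on a relative path. A self-contained sketch of why `sys.path.insert(0, ...)` makes such imports work (the module name and message are stand-ins created on the fly):

```python
import os
import sys
import tempfile

# Create a stand-in for a cloned repo containing a module we want to import.
repo_dir = tempfile.mkdtemp()
with open(os.path.join(repo_dir, "custom_trainer.py"), "w") as f:
    f.write("MESSAGE = 'loaded from repo checkout'\n")

# Inserting at index 0 puts this checkout ahead of every installed package,
# so `import custom_trainer` finds the repo copy first.
sys.path.insert(0, repo_dir)

import custom_trainer
print(custom_trainer.MESSAGE)
```

Using `insert(0, ...)` rather than `append` matters when an identically named module is already installed elsewhere on the path.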
2 changes: 1 addition & 1 deletion improved-diffusion/scripts/run_train.py
@@ -97,7 +97,7 @@

COMMANDLINE = f" OPENAI_LOGDIR={Model_FILE} " \
f"TOKENIZERS_PARALLELISM=false " \
-f"python scripts/train.py " \
+f"python /kaggle/working/Diffusion-LM/improved-diffusion/scripts/train.py " \
f"--checkpoint_path {Model_FILE} " \
f"--model_arch {args.model_arch} " \
f"--modality {args.modality} " \
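run_train.py composes a shell command with inline environment-variable assignments (`OPENAI_LOGDIR=... TOKENIZERS_PARALLELISM=false python ...`). Inline assignments only work when the string goes through a shell; a sketch of the same effect in a shell-free form, passing an env dict to `subprocess.run` (the logdir value is illustrative):

```python
import os
import subprocess
import sys

# Equivalent of `OPENAI_LOGDIR=./logdir TOKENIZERS_PARALLELISM=false python ...`
# without invoking a shell: extend the current environment explicitly.
env = dict(os.environ, OPENAI_LOGDIR="./logdir", TOKENIZERS_PARALLELISM="false")

# A trivial child process that reports the variable it received.
cmd = [sys.executable, "-c", "import os; print(os.environ['OPENAI_LOGDIR'])"]
out = subprocess.run(cmd, env=env, capture_output=True, text=True)
print(out.stdout.strip())
```

The env-dict form avoids quoting pitfalls that arise when paths with spaces are interpolated into a single f-string command.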
3 changes: 3 additions & 0 deletions improved-diffusion/scripts/train.py
@@ -5,6 +5,9 @@
import argparse
import json, torch, os
import numpy as np
+import sys
+sys.path.append('/kaggle/working/Diffusion-LM/improved-diffusion')
+
from improved_diffusion import dist_util, logger
from improved_diffusion.image_datasets import load_data
from improved_diffusion.text_datasets import load_data_text
1 change: 1 addition & 0 deletions train_run.py
@@ -32,6 +32,7 @@
parser.add_argument('--temperature', type=float, default=1., help='')
parser.add_argument('--weight_decay', type=float, default=0.0, help='')
parser.add_argument('--percent', type=float, default=1.0, help='')
+parser.add_argument('--file_path', type=str, default='', help='Path to the input file')

parser.add_argument('--submit', type=str, default='no', help='')
parser.add_argument('--use_big', type=str, default='no', help='')
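The new `--file_path` flag follows the file's existing argparse pattern. A self-contained sketch of how the added line behaves (the path passed in is only an example value):

```python
import argparse

parser = argparse.ArgumentParser()
# The line added by this PR: an optional string flag, empty by default.
parser.add_argument('--file_path', type=str, default='', help='Path to the input file')

# Simulate a command line; with no flag given, file_path stays ''.
args = parser.parse_args(['--file_path', '/kaggle/input/roc_train.json'])
print(args.file_path)
```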
22 changes: 18 additions & 4 deletions transformers/examples/pytorch/language-modeling/run_clm.py
@@ -33,6 +33,12 @@
import stanza
import spacy_stanza
from datasets import load_dataset, load_metric
+import benepar
+# Download the benepar model
+benepar.download('benepar_en3')
+
+# Initialize the parser
+parser = benepar.Parser("benepar_en3")

import transformers
from transformers import (
@@ -112,15 +118,15 @@ class ModelArguments:
metadata={"help": "block or pad"},
)
roc_train: Optional[str] = field(
-default='/juice/scr/xlisali/diffusion_lm/ROCstory',
+default='/kaggle/working/Diffusion-LM/datasets/ROCstory/',
metadata={"help": "roc story path"},
)
wiki_train: Optional[str] = field(
default='/u/scr/xlisali/diffusion_lm/simple_wiki/data.v1.split/simple.training.txt',
metadata={"help": "simple wiki path"},
)
e2e_train: Optional[str] = field(
-default='/u/scr/xlisali/e2e_data',
+default='datasets/e2e_data/',
metadata={"help": "simple wiki path"},
)
metadata={"help": "simple wiki path"},
)

@@ -516,6 +522,7 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)


log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
@@ -603,7 +610,7 @@

if model_args.experiment.startswith('roc'):
tokenizer = load_tokenizer('roc', 'random',
-'/u/scr/nlp/xlisali/predictability/diffusion_models_v7/diff_roc_pad_rand16_transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd108_xstart')
+'/kaggle/working/diffusion_models/diff_roc_block_rand16_transformer_lr0.0001_0.0_200_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart_roc')
vocab = {v: k for k, v in tokenizer.items()}
print(len(tokenizer), len(vocab), 'loaded vocabs')

@@ -905,7 +912,7 @@ def main():

filename = model_args.init_emb # '/u/scr/nlp/xlisali/predictability/diffusion_models_v3/diff_e2e-tgt_block_rand16_transformer_lr0.0001_2000_cosine_Lsimple_h128_s2_sd101'
path_save = '{}/random_emb.torch'.format(filename)
-path_learned = '{}/ema_0.9999_200000.pt'.format(filename)
+path_learned = '{}/ema_0.9999_000200.pt'.format(filename)
if model_args.experiment == 'e2e-tgt-pos' and model_args.learned_emb == 'no':
model.transformer.embeddings.word_embeddings.load_state_dict(torch.load(path_save))
model.transformer.embeddings.word_embeddings.weight.requires_grad = False
@@ -1050,6 +1057,13 @@ def tokenize_function(examples):
doc = nlp(sent_full)
doc_token_pos = [(token.text, token.pos_,) for token in doc]
len_lst = [len(seq) for seq in examples['text']]
+
+# Debug prints to check alignment
+print("Original Sentences:", examples['text'])
+print("Parsed Tokens and POS:", doc_token_pos)
+print("Token Lengths:", len_lst)
+
+#assert sum(len_lst) == len(doc_token_pos)
# print(sum(len_lst), len(doc_token_pos))
assert sum(len_lst) == len(doc_token_pos)
pos_lst = []
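The assertion kept in the hunk above, `sum(len_lst) == len(doc_token_pos)`, checks that the tagger produced exactly one (token, POS) pair per input token across the whole batch; the new debug prints help locate the sentence when it fails. A toy illustration of that invariant, with hand-tokenized sentences standing in for the spaCy/stanza output (all data below is made up):

```python
# Each example is a pre-tokenized sentence; the tagger output is one
# (token, POS) pair per input token, concatenated across all sentences.
examples = {'text': [['the', 'bear', 'slept'], ['it', 'snowed']]}
doc_token_pos = [('the', 'DET'), ('bear', 'NOUN'), ('slept', 'VERB'),
                 ('it', 'PRON'), ('snowed', 'VERB')]

len_lst = [len(seq) for seq in examples['text']]

# If the tagger dropped or merged tokens, these totals would disagree and
# any downstream per-sentence slicing of doc_token_pos would be misaligned.
assert sum(len_lst) == len(doc_token_pos)
print(sum(len_lst))
```

When the totals disagree in practice (e.g. the tokenizer splits a word the tagger treats as one token), the fix is to retokenize consistently rather than silence the assert, which is why re-enabling it here is the safer choice.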