This repository was archived by the owner on Jan 22, 2024. It is now read-only.
36 commits
ead2c68
Data generation
dbieber Oct 27, 2021
356b9cd
added the function for assertion error
RishabGoel Nov 10, 2021
eac4d07
fixed a bug
RishabGoel Nov 10, 2021
a351c69
generate data for training
RishabGoel Nov 24, 2021
d0a26c9
Data generation
dbieber Oct 27, 2021
db39389
added the function for assertion error
RishabGoel Nov 10, 2021
0621ae1
fixed a bug
RishabGoel Nov 10, 2021
773d5d0
rebased branch
RishabGoel Dec 6, 2021
5136ddf
added specific dataset handling
RishabGoel Dec 7, 2021
9e1fd39
added the run configs
RishabGoel Dec 7, 2021
ec94a0d
removed a bug
RishabGoel Dec 15, 2021
844225d
changed config
RishabGoel Jan 3, 2022
3b3abbc
changed filepath
RishabGoel Jan 3, 2022
ecb3cec
reverted the create dir
RishabGoel Jan 3, 2022
fb8f205
resolved merge conflict
RishabGoel Jan 3, 2022
9fb6a39
GGNN baseline (#87)
dbieber Jan 3, 2022
a4d0fcf
Enable dry-run sweeps. (#89)
dbieber Jan 4, 2022
25bf3dc
Support runs for computing metric variances (#90)
dbieber Jan 6, 2022
713594e
GGNN experiments (#91)
dbieber Jan 10, 2022
09c690f
Restore logic for test (#93)
dbieber Jan 11, 2022
bb5b77f
Data generation
dbieber Oct 27, 2021
1435e5f
added the function for assertion error
RishabGoel Nov 10, 2021
4736076
fixed a bug
RishabGoel Nov 10, 2021
a8365c7
Data generation
dbieber Oct 27, 2021
6080a69
added the function for assertion error
RishabGoel Nov 10, 2021
5213ee8
generate data for training
RishabGoel Nov 24, 2021
4e7b0df
added specific dataset handling
RishabGoel Dec 7, 2021
063156f
added the run configs
RishabGoel Dec 7, 2021
29b19c8
removed a bug
RishabGoel Dec 15, 2021
2a5f3f4
changed config
RishabGoel Jan 3, 2022
ac5fd95
changed filepath
RishabGoel Jan 3, 2022
56e0f1d
reverted the create dir
RishabGoel Jan 3, 2022
6c5d41a
resolved merge conflict
RishabGoel Jan 3, 2022
66382e2
resolved merge conflicts
RishabGoel Jan 15, 2022
ac2fce5
added correct paths
RishabGoel Jan 15, 2022
5419d25
debugged directory creation
RishabGoel Jan 15, 2022
7 changes: 7 additions & 0 deletions config/default.py
@@ -23,6 +23,8 @@ def default_config():
config.use_in_dataset_field = True

# Training configs
+  config.train_steps = 0  # 0 means run forever.
+  config.seed = 0
config.optimizer = 'adam' # sgd, adam
config.learning_rate = 0.03
config.grad_clip_value: float = 0.0 # 0 means no clipping.
@@ -48,6 +50,11 @@ def default_config():
config.compressive_max_skip = 10
config.compressive_mask_maker = 'default'

+  # GGNN Configs
+  config.ggnn_use_exit_node_embedding = False
+  config.ggnn_use_fixed_num_layers = True
+  config.ggnn_layers = 3
+
# Dataset filtering and configs
config.epochs: Optional[int] = 0
config.batch_size: int = 128
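For context, a minimal sketch of how the new GGNN flags might be consumed when constructing a model. build_ggnn_kwargs and its return keys are illustrative assumptions, not the repository's API:

def build_ggnn_kwargs(config):
  # If ggnn_use_fixed_num_layers is set, run exactly config.ggnn_layers
  # message-passing steps; otherwise leave the count unset so the model can
  # derive it (an assumption about intent, not confirmed by this diff).
  num_layers = config.ggnn_layers if config.ggnn_use_fixed_num_layers else None
  return {
      'num_layers': num_layers,
      'use_exit_node_embedding': config.ggnn_use_exit_node_embedding,
  }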
23 changes: 15 additions & 8 deletions core/data/codenet_paths.py
@@ -5,25 +5,28 @@
import time

DEFAULT_CONFIG_PATH = 'config/default.py'
-DEFAULT_DATASET_PATH = 'datasets/codenet/2021-12-06-f=0.01'
-TEST_DATASET_PATH = 'datasets/codenet/2021-12-22-f=0.01'
+DEFAULT_DATASET_PATH = 'datasets/codenet/2021-12-09-f=0.01'
+TEST_DATASET_PATH = 'datasets/codenet/2021-12-29-f=0.01'
DEFAULT_TOKENIZER_PATH = 'out/tokenizers/train-1000000.json'
DOCSTRING_TOKENIZER_PATH = 'out/tokenizers/train-docstrings-1000000.json'
DEFAULT_SPLITS_PATH = 'out/splits/default.json'
DEFAULT_EXPERIMENTS_DIR = 'out/experiments'
EXPERIMENT_ID_PATH = 'out/experiment_id.txt'

-FULL_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-22-nodoc'
-FULL_DATASET_PATH_WITH_DOCSTRINGS = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-22'
-SMALL_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-23-nodoc-f=0.1'
-SMALL_DATASET_PATH_WITH_DOCSTRINGS = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-23-f=0.1'
+FULL_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-29-nodoc'
+FULL_DATASET_PATH_WITH_DOCSTRINGS = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-29'
+SMALL_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-29-nodoc-f=0.1'
+SMALL_DATASET_PATH_WITH_DOCSTRINGS = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-29-f=0.1'
# Raw control_flow_programs data pattern:
DEFAULT_CFP_DATA_PATTERN = '/mnt/runtime-error-problems-experiments/datasets/control_flow_programs/decimal-large-state-L10/0.0.48/control_flow_programs-train.tfrecord-*'
# Processed control_flow_programs dataset path:
DEFAULT_CFP_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/control_flow_programs/processed/decimal-large-state-L10/0.0.48-002/'

-RAW_CFP_RAISE_DATA_PATH = '/mnt/runtime-error-problems-experiments/datasets/control_flow_programs_raise/decimal-large-state-L30/2021-10-19-001/synthetic-20211018-001.tfrecord'
-DEFAULT_CFP_RAISE_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/control_flow_programs_raise/processed/decimal-large-state-L30/2021-10-19-001/'
+RAW_CFP_RAISE_DATA_PATH = '/mnt/error-prediction-synthetic-data/synthetic-2021-12-29-001.tfrecord'
+DEFAULT_CFP_RAISE_DATASET_PATH = '/mnt/error-prediction-synthetic-data/synthetic-2021-12-29-001/'
-# RAW_CFP_SYNTH_FULL_DATASET_PATH = '/mnt/error-prediction-synthetic-data/synthetic-2021-11-19-001/errors-only'
+# RAW_CFP_SYNTH_FULL_DATASET_PATH = '/mnt/error-prediction-synthetic-data/synthetic-2022-01-01-002/errors-only'


DATA_ROOT = '/mnt/disks/project-codenet-data/Project_CodeNet/'
OUT_ROOT = '/mnt/disks/project-codenet-data/out/'
@@ -60,6 +63,10 @@
DATA_ROOT = '/mnt/project-codenet-storage/Project_CodeNet/'
EVALS_ROOT = '/mnt/project-codenet-storage/out/evals'
OUT_ROOT = '/mnt/project-codenet-storage/out'
+elif HOSTNAME == 'Rishabs-MacBook-Air.local':
+  RAW_CFP_RAISE_DATA_PATH = 'tmp.tfrecord'
+  DEFAULT_CFP_RAISE_DATASET_PATH = './tmp/'
+  PERSONAL_ACCESS_TOKEN_PATH = '/Users/rishabgoel/Documents/pt.txt'

# On TPUs, we mount the GCS bucket "runtime-error-problems-experiments"
# at /mnt/runtime-error-problems-experiments.
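The new elif above follows the file's existing pattern of overriding paths per machine. A minimal sketch of the dispatch mechanism, assuming HOSTNAME comes from the standard library (the module may obtain it differently):

import socket

HOSTNAME = socket.gethostname()  # e.g. 'Rishabs-MacBook-Air.local'
if HOSTNAME == 'Rishabs-MacBook-Air.local':
  # Point the synthetic-data paths at small local files for development.
  RAW_CFP_RAISE_DATA_PATH = 'tmp.tfrecord'
  DEFAULT_CFP_RAISE_DATASET_PATH = './tmp/'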
20 changes: 13 additions & 7 deletions core/data/data_io.py
@@ -46,7 +46,7 @@ def to_tf_example(problem):
'in_dataset': _int64_feature([problem.in_dataset]),
'num_tokens': _int64_feature([len(problem.tokens)]),
'num_nodes': _int64_feature([len(problem.true_branch_nodes)]),
-    'num_edges': _int64_feature([len(problem.edge_sources)]),
+    'num_edges': _int64_feature([problem.num_edges]),
}))


@@ -88,16 +88,18 @@ def decode_fn(record_bytes, include_strings=False):
example['post_domination_matrix'],
example['post_domination_matrix_shape']
)
+  example['edge_sources_shape'] = tf.shape(example['edge_sources'])
return example


def get_fake_input(batch_size, max_tokens, max_num_nodes, max_num_edges):
return {
'tokens': jnp.ones((batch_size, max_tokens), dtype=jnp.int32),
'docstring_tokens': jnp.ones((batch_size, max_tokens), dtype=jnp.int32),
-      'edge_sources': jnp.zeros((batch_size, max_num_edges), dtype=jnp.int32),
-      'edge_dests': jnp.ones((batch_size, max_num_edges), dtype=jnp.int32),
-      'edge_types': jnp.zeros((batch_size, max_num_edges), dtype=jnp.int32),
+      'edge_sources': jnp.zeros((batch_size, 2 * max_num_edges + 4), dtype=jnp.int32),
+      'edge_dests': jnp.ones((batch_size, 2 * max_num_edges + 4), dtype=jnp.int32),
+      'edge_types': jnp.zeros((batch_size, 2 * max_num_edges + 4), dtype=jnp.int32),
+      'edge_sources_shape': jnp.full((batch_size, 1), 2 * max_num_edges + 4, dtype=jnp.int32),
'node_token_span_starts': jnp.zeros((batch_size, max_num_nodes), dtype=jnp.int32),
'node_token_span_ends': jnp.ones((batch_size, max_num_nodes), dtype=jnp.int32),
'true_branch_nodes': jnp.ones((batch_size, max_num_nodes), dtype=jnp.int32),
@@ -132,9 +134,10 @@ def get_padded_shapes(max_tokens, max_num_nodes, max_num_edges, include_strings=
shapes = {
'tokens': [max_tokens],
'docstring_tokens': [max_tokens],
-      'edge_sources': [max_num_edges],
-      'edge_dests': [max_num_edges],
-      'edge_types': [max_num_edges],
+      'edge_sources': [2 * max_num_edges + 6],
+      'edge_dests': [2 * max_num_edges + 6],
+      'edge_types': [2 * max_num_edges + 6],
+      'edge_sources_shape': [1],  # Added in trainer.py.
'node_token_span_starts': [max_num_nodes],
'node_token_span_ends': [max_num_nodes],
'token_node_indexes': [max_tokens],
@@ -243,6 +246,9 @@ def load_dataset(dataset_path=codenet_paths.DEFAULT_DATASET_PATH, split='train',
for i in split_ranges[split]
]
return load_tfrecords_dataset(tfrecord_paths, include_strings=include_strings)
+  elif 'errors-only' in dataset_path or 'errors-L2E' in dataset_path:
+    tfrecord_path = codenet_paths.make_tfrecord_path(dataset_path, split)
+    return load_tfrecord_dataset(tfrecord_path, include_strings=include_strings)
else:
tfrecord_path = codenet_paths.make_tfrecord_path(dataset_path, split)
return load_tfrecord_dataset(tfrecord_path, include_strings=include_strings)
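Because the padded edge arrays are now longer than each example's true edge list, the edge_sources_shape feature recorded in decode_fn preserves the pre-padding edge count. A minimal masking sketch, assuming TensorFlow-side consumers; real_edge_mask is a hypothetical helper, not the repository's API:

import tensorflow as tf

def real_edge_mask(example, padded_len):
  # edge_sources_shape holds the number of edges before padding.
  num_edges = example['edge_sources_shape'][0]
  return tf.sequence_mask(num_edges, maxlen=padded_len, dtype=tf.float32)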
1 change: 1 addition & 0 deletions core/data/generation/constants.py
@@ -0,0 +1 @@
INDENT_STRING = ' '
210 changes: 210 additions & 0 deletions core/data/generation/generate.py
@@ -0,0 +1,210 @@
"""Generates Control Flow Programs.

This file was introduced as part of the Exception IPA-GNN effort, for generating
a new dataset suitable for testing the vanilla IPA-GNN and Exception IPA-GNN.
"""

import collections
import dataclasses
import os
import random
from typing import Optional, Sequence, Text, Tuple

from absl import app
from python_graphs import control_flow
import tensorflow as tf
import tqdm

from core.data import codenet_paths
from core.data import process
from core.data.generation import program_generator
from core.data.generation import python_interpreter

TFRECORD_PATH = codenet_paths.RAW_CFP_RAISE_DATA_PATH
TFRECORD_PATH = 'tmp-002.tfrecord'  # Overrides the path above with a local file.
ASSERTION_ERROR_PROB = 0.5
ADD_ASSERTION_ERROR = True

DEFAULT_OPS = ("+=", "-=", "*=")


@dataclasses.dataclass
class ArithmeticIfRepeatsConfig:
"""Config for ArithmeticIfRepeats ProgramGenerator.

Attributes:
base: The base to represent the integers in.
length: The number of statements in the generated programs.
num_digits: The number of digits in the values used by the programs.
max_repeat_statements: The maximum number of repeat statements allowed in
a program.
max_repetitions: The maximum number of repetitions a repeat statement may
specify.
repeat_probability: The probability that a given statement is a repeat
statement, provided a repeat statement is possible at that location.
max_if_statements: The maximum number of if statements allowed in a program.
if_probability: The probability that a given statement is an if statement,
provided an if statement is possible at that location.
ifelse_probability: The probability that a given statement is an if-else
statement, provided an if statement is possible at that location.
max_nesting: The maximum depth of nesting permitted, or None if no limit.
max_block_size: The maximum number of statements permitted in a block.
ops: The ops allowed in the generated programs.
encoder_name: The encoder name to use to encode the generated programs.
mod: The value (if any) to mod the intermediate values of the program by
after each step of execution.
output_mod: The value (if any) to mod the final values of the program by.
"""
base: int
length: int
num_digits: int = 1
max_repeat_statements: Optional[int] = 2
max_repetitions: int = 9
repeat_probability: float = 0.1
max_if_statements: Optional[int] = 2
if_probability: float = 0.2
ifelse_probability: float = 0.2
max_nesting: Optional[int] = None
max_block_size: Optional[int] = 9
ops: Tuple[Text, ...] = DEFAULT_OPS
encoder_name: Text = "simple"
mod: Optional[int] = 10
output_mod: Optional[int] = None



def int64_feature(value):
"""Constructs a tf.train.Feature for the given int64 value list."""
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


def bytes_feature(values):
"""Constructs a tf.train.Feature for the given str value list."""
values = [v.encode('utf-8') for v in values]
return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))


def to_tf_example(source, target, steps):
"""Constructs a tf.train.Example for the source code."""
return tf.train.Example(features=tf.train.Features(feature={
'source': bytes_feature([source]),
'target': bytes_feature([target]),
'steps': int64_feature([steps]),
}))


def decode_fn(record_bytes):
features = {
'source': tf.io.FixedLenFeature([1], dtype=tf.string),
'target': tf.io.FixedLenFeature([1], dtype=tf.string),
'steps': tf.io.FixedLenFeature([1], dtype=tf.int64),
}
return tf.io.parse_single_example(record_bytes, features)
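# Note: each FixedLenFeature above is declared with shape [1], so the parsed
# tensors are length-1 vectors; callers index into them with [0], as read()
# does below.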


def load_dataset(tfrecord_paths):
return tf.data.TFRecordDataset(
tfrecord_paths,
compression_type=None, buffer_size=None, num_parallel_reads=32
).map(decode_fn)


def read():
for example in load_dataset([TFRECORD_PATH]):
source = example['source'].numpy()[0].decode('utf-8')
target = example['target'].numpy()[0].decode('utf-8')
print(source)
print('---')
# if 'raise' in source:
# print(target)


def generate_example_from_python_source(executor, base, python_source, mod, output_mod):
"""Generates an example dict from the given statements."""
cfg = control_flow.get_control_flow_graph(python_source)
python_source_lines = python_source.strip().split("\n")

values = {"v0": 1} # Assume v0 starts at 1.
try:
values = python_interpreter.evaluate_cfg(
executor, cfg, mod=mod,
initial_values=values,
timeout=200)
error_type = "NoError"
except Exception as e: # pylint: disable=broad-except
error_type = type(e).__name__
target_output = values["v0"]

if output_mod is not None:
try:
target_output %= output_mod
except TypeError:
target_output = 1

return {
'human_readable_target_output': str(target_output),
'error_type': error_type
}


def add_assert_error(source, example):
  """Randomly appends a passing or failing assert on v0 to the source."""
  if example['error_type'] == 'RuntimeError':
    return source, example
  is_error = random.choices([0, 1], [1 - ASSERTION_ERROR_PROB, ASSERTION_ERROR_PROB])[0]
  add_val = random.randint(1, 10)
  current_val = int(example['human_readable_target_output'])
  if is_error:
    # Assert a deliberately different value so the program raises.
    source = f"{source}\nassert v0=={abs(current_val + add_val) % 1000}"
    example['error_type'] = "AssertionError"
  else:
    source = f"{source}\nassert v0=={current_val}"
  return source, example
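# Worked example with assumed random draws: if the completed program ends with
# v0 == 42, the no-error branch appends "assert v0==42" (which passes), while
# the error branch with add_val == 7 appends "assert v0==49"; 49 != 42, so the
# program raises and the example is relabeled 'AssertionError'.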


def main(argv: Sequence[str]) -> None:
del argv # Unused.

# if os.path.exists(TFRECORD_PATH):
# return read()
# os.makedirs(TFRECORD_PATH, exist_ok=True)

executor = python_interpreter.ExecExecutor()
counts = collections.Counter()
program_generator_config = ArithmeticIfRepeatsConfig(
base=10,
max_if_statements=5,
length=30,
)
with tf.io.TFRecordWriter(TFRECORD_PATH) as file_writer:
for _ in tqdm.tqdm(range(5000000)):
source = program_generator.generate_python_source(
30, program_generator_config)
# print(source)
# print()

example = (
generate_example_from_python_source(
executor, program_generator_config.base, source,
mod=1000,
output_mod=1000,
)
)
# print(example)

source, example = add_assert_error(source, example)

target = example['human_readable_target_output']
error_type = example['error_type']
lines = source.split('\n')
steps = process.get_step_limit(lines)
counts[target] += 1

if error_type != 'NoError':
target = error_type
record_bytes = to_tf_example(source, target, steps).SerializeToString()
file_writer.write(record_bytes)
print(dict(counts))


if __name__ == '__main__':
app.run(main)
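For a quick sanity check after generation, a minimal usage sketch (path assumed) that inspects a few records using this module's own load_dataset:

for example in load_dataset(['tmp-002.tfrecord']).take(3):
  print(example['source'].numpy()[0].decode('utf-8'))
  print('target:', example['target'].numpy()[0].decode('utf-8'))
  print('steps:', int(example['steps'].numpy()[0]))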