This repository was archived by the owner on Jan 22, 2024. It is now read-only.
36 commits
ead2c68
Data generation
dbieber Oct 27, 2021
356b9cd
added the function for assertion error
RishabGoel Nov 10, 2021
eac4d07
fixed a bug
RishabGoel Nov 10, 2021
a351c69
generate data for training
RishabGoel Nov 24, 2021
d0a26c9
Data generation
dbieber Oct 27, 2021
db39389
added the function for assertion error
RishabGoel Nov 10, 2021
0621ae1
fixed a bug
RishabGoel Nov 10, 2021
773d5d0
rebased branch
RishabGoel Dec 6, 2021
5136ddf
added specific dataset handling
RishabGoel Dec 7, 2021
9e1fd39
added the run configs
RishabGoel Dec 7, 2021
ec94a0d
removed a bug
RishabGoel Dec 15, 2021
844225d
changed config
RishabGoel Jan 3, 2022
3b3abbc
changed filepath
RishabGoel Jan 3, 2022
ecb3cec
reverted the create dir
RishabGoel Jan 3, 2022
fb8f205
resolved merge conflict
RishabGoel Jan 3, 2022
9fb6a39
GGNN baseline (#87)
dbieber Jan 3, 2022
a4d0fcf
Enable dry-run sweeps. (#89)
dbieber Jan 4, 2022
25bf3dc
Support runs for computing metric variances (#90)
dbieber Jan 6, 2022
713594e
GGNN experiments (#91)
dbieber Jan 10, 2022
09c690f
Restore logic for test (#93)
dbieber Jan 11, 2022
bb5b77f
Data generation
dbieber Oct 27, 2021
1435e5f
added the function for assertion error
RishabGoel Nov 10, 2021
4736076
fixed a bug
RishabGoel Nov 10, 2021
a8365c7
Data generation
dbieber Oct 27, 2021
6080a69
added the function for assertion error
RishabGoel Nov 10, 2021
5213ee8
generate data for training
RishabGoel Nov 24, 2021
4e7b0df
added specific dataset handling
RishabGoel Dec 7, 2021
063156f
added the run configs
RishabGoel Dec 7, 2021
29b19c8
removed a bug
RishabGoel Dec 15, 2021
2a5f3f4
changed config
RishabGoel Jan 3, 2022
ac5fd95
changed filepath
RishabGoel Jan 3, 2022
56e0f1d
reverted the create dir
RishabGoel Jan 3, 2022
6c5d41a
resolved merge conflict
RishabGoel Jan 3, 2022
66382e2
resolved merge conflicts
RishabGoel Jan 15, 2022
ac2fce5
added correct paths
RishabGoel Jan 15, 2022
5419d25
debugged directory creation
RishabGoel Jan 15, 2022
7 changes: 7 additions & 0 deletions config/default.py
@@ -23,6 +23,8 @@ def default_config():
config.use_in_dataset_field = True

# Training configs
+  config.train_steps = 0  # 0 means run forever.
+  config.seed = 0
config.optimizer = 'adam' # sgd, adam
config.learning_rate = 0.03
config.grad_clip_value: float = 0.0 # 0 means no clipping.
@@ -48,6 +50,11 @@ def default_config():
config.compressive_max_skip = 10
config.compressive_mask_maker = 'default'

+  # GGNN Configs
+  config.ggnn_use_exit_node_embedding = False
+  config.ggnn_use_fixed_num_layers = True
+  config.ggnn_layers = 3
+
# Dataset filtering and configs
config.epochs: Optional[int] = 0
config.batch_size: int = 128
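For context, a minimal sketch of how the new GGNN flags might be consumed when constructing a model. build_ggnn_kwargs and its return keys are illustrative assumptions, not the repository's API:

def build_ggnn_kwargs(config):
  # If ggnn_use_fixed_num_layers is set, run exactly config.ggnn_layers
  # message-passing steps; otherwise leave the count unset so the model can
  # derive it (an assumption about intent, not confirmed by this diff).
  num_layers = config.ggnn_layers if config.ggnn_use_fixed_num_layers else None
  return {
      'num_layers': num_layers,
      'use_exit_node_embedding': config.ggnn_use_exit_node_embedding,
  }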
23 changes: 15 additions & 8 deletions core/data/codenet_paths.py
@@ -5,25 +5,28 @@
import time

DEFAULT_CONFIG_PATH = 'config/default.py'
-DEFAULT_DATASET_PATH = 'datasets/codenet/2021-12-06-f=0.01'
-TEST_DATASET_PATH = 'datasets/codenet/2021-12-22-f=0.01'
+DEFAULT_DATASET_PATH = 'datasets/codenet/2021-12-09-f=0.01'
+TEST_DATASET_PATH = 'datasets/codenet/2021-12-29-f=0.01'
DEFAULT_TOKENIZER_PATH = 'out/tokenizers/train-1000000.json'
DOCSTRING_TOKENIZER_PATH = 'out/tokenizers/train-docstrings-1000000.json'
DEFAULT_SPLITS_PATH = 'out/splits/default.json'
DEFAULT_EXPERIMENTS_DIR = 'out/experiments'
EXPERIMENT_ID_PATH = 'out/experiment_id.txt'

-FULL_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-22-nodoc'
-FULL_DATASET_PATH_WITH_DOCSTRINGS = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-22'
-SMALL_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-23-nodoc-f=0.1'
-SMALL_DATASET_PATH_WITH_DOCSTRINGS = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-23-f=0.1'
+FULL_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-29-nodoc'
+FULL_DATASET_PATH_WITH_DOCSTRINGS = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-29'
+SMALL_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-29-nodoc-f=0.1'
+SMALL_DATASET_PATH_WITH_DOCSTRINGS = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-29-f=0.1'
# Raw control_flow_programs data pattern:
DEFAULT_CFP_DATA_PATTERN = '/mnt/runtime-error-problems-experiments/datasets/control_flow_programs/decimal-large-state-L10/0.0.48/control_flow_programs-train.tfrecord-*'
# Processed control_flow_programs dataset path:
DEFAULT_CFP_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/control_flow_programs/processed/decimal-large-state-L10/0.0.48-002/'

-RAW_CFP_RAISE_DATA_PATH = '/mnt/runtime-error-problems-experiments/datasets/control_flow_programs_raise/decimal-large-state-L30/2021-10-19-001/synthetic-20211018-001.tfrecord'
-DEFAULT_CFP_RAISE_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/control_flow_programs_raise/processed/decimal-large-state-L30/2021-10-19-001/'
+RAW_CFP_RAISE_DATA_PATH = '/mnt/error-prediction-synthetic-data/synthetic-2021-12-29-001.tfrecord'
+DEFAULT_CFP_RAISE_DATASET_PATH = '/mnt/error-prediction-synthetic-data/synthetic-2021-12-29-001/'
-# RAW_CFP_SYNTH_FULL_DATASET_PATH = '/mnt/error-prediction-synthetic-data/synthetic-2021-11-19-001/errors-only'
+# RAW_CFP_SYNTH_FULL_DATASET_PATH = '/mnt/error-prediction-synthetic-data/synthetic-2022-01-01-002/errors-only'


DATA_ROOT = '/mnt/disks/project-codenet-data/Project_CodeNet/'
OUT_ROOT = '/mnt/disks/project-codenet-data/out/'
@@ -60,6 +63,10 @@
DATA_ROOT = '/mnt/project-codenet-storage/Project_CodeNet/'
EVALS_ROOT = '/mnt/project-codenet-storage/out/evals'
OUT_ROOT = '/mnt/project-codenet-storage/out'
+elif HOSTNAME == 'Rishabs-MacBook-Air.local':
+  RAW_CFP_RAISE_DATA_PATH = 'tmp.tfrecord'
+  DEFAULT_CFP_RAISE_DATASET_PATH = './tmp/'
+  PERSONAL_ACCESS_TOKEN_PATH = '/Users/rishabgoel/Documents/pt.txt'

# On TPUs, we mount the GCS bucket "runtime-error-problems-experiments"
# at /mnt/runtime-error-problems-experiments.
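The new elif above follows the file's existing pattern of overriding paths per machine. A minimal sketch of the dispatch mechanism, assuming HOSTNAME comes from the standard library (the module may obtain it differently):

import socket

HOSTNAME = socket.gethostname()  # e.g. 'Rishabs-MacBook-Air.local'
if HOSTNAME == 'Rishabs-MacBook-Air.local':
  # Point the synthetic-data paths at small local files for development.
  RAW_CFP_RAISE_DATA_PATH = 'tmp.tfrecord'
  DEFAULT_CFP_RAISE_DATASET_PATH = './tmp/'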
20 changes: 13 additions & 7 deletions core/data/data_io.py
@@ -46,7 +46,7 @@ def to_tf_example(problem):
'in_dataset': _int64_feature([problem.in_dataset]),
'num_tokens': _int64_feature([len(problem.tokens)]),
'num_nodes': _int64_feature([len(problem.true_branch_nodes)]),
-    'num_edges': _int64_feature([len(problem.edge_sources)]),
+    'num_edges': _int64_feature([problem.num_edges]),
}))


@@ -88,16 +88,18 @@ def decode_fn(record_bytes, include_strings=False):
example['post_domination_matrix'],
example['post_domination_matrix_shape']
)
+  example['edge_sources_shape'] = tf.shape(example['edge_sources'])
return example


def get_fake_input(batch_size, max_tokens, max_num_nodes, max_num_edges):
return {
'tokens': jnp.ones((batch_size, max_tokens), dtype=jnp.int32),
'docstring_tokens': jnp.ones((batch_size, max_tokens), dtype=jnp.int32),
-      'edge_sources': jnp.zeros((batch_size, max_num_edges), dtype=jnp.int32),
-      'edge_dests': jnp.ones((batch_size, max_num_edges), dtype=jnp.int32),
-      'edge_types': jnp.zeros((batch_size, max_num_edges), dtype=jnp.int32),
+      'edge_sources': jnp.zeros((batch_size, 2 * max_num_edges + 4), dtype=jnp.int32),
+      'edge_dests': jnp.ones((batch_size, 2 * max_num_edges + 4), dtype=jnp.int32),
+      'edge_types': jnp.zeros((batch_size, 2 * max_num_edges + 4), dtype=jnp.int32),
+      'edge_sources_shape': jnp.full((batch_size, 1), 2 * max_num_edges + 4, dtype=jnp.int32),
'node_token_span_starts': jnp.zeros((batch_size, max_num_nodes), dtype=jnp.int32),
'node_token_span_ends': jnp.ones((batch_size, max_num_nodes), dtype=jnp.int32),
'true_branch_nodes': jnp.ones((batch_size, max_num_nodes), dtype=jnp.int32),
@@ -132,9 +134,10 @@ def get_padded_shapes(max_tokens, max_num_nodes, max_num_edges, include_strings=
shapes = {
'tokens': [max_tokens],
'docstring_tokens': [max_tokens],
-      'edge_sources': [max_num_edges],
-      'edge_dests': [max_num_edges],
-      'edge_types': [max_num_edges],
+      'edge_sources': [2 * max_num_edges + 6],
+      'edge_dests': [2 * max_num_edges + 6],
+      'edge_types': [2 * max_num_edges + 6],
+      'edge_sources_shape': [1],  # Added in trainer.py.
'node_token_span_starts': [max_num_nodes],
'node_token_span_ends': [max_num_nodes],
'token_node_indexes': [max_tokens],
@@ -243,6 +246,9 @@ def load_dataset(dataset_path=codenet_paths.DEFAULT_DATASET_PATH, split='train',
for i in split_ranges[split]
]
return load_tfrecords_dataset(tfrecord_paths, include_strings=include_strings)
+  elif 'errors-only' in dataset_path or 'errors-L2E' in dataset_path:
+    tfrecord_path = codenet_paths.make_tfrecord_path(dataset_path, split)
+    return load_tfrecord_dataset(tfrecord_path, include_strings=include_strings)
else:
tfrecord_path = codenet_paths.make_tfrecord_path(dataset_path, split)
return load_tfrecord_dataset(tfrecord_path, include_strings=include_strings)
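Because the padded edge arrays are now longer than each example's true edge list, the edge_sources_shape feature recorded in decode_fn preserves the pre-padding edge count. A minimal masking sketch, assuming TensorFlow-side consumers; real_edge_mask is a hypothetical helper, not the repository's API:

import tensorflow as tf

def real_edge_mask(example, padded_len):
  # edge_sources_shape holds the number of edges before padding.
  num_edges = example['edge_sources_shape'][0]
  return tf.sequence_mask(num_edges, maxlen=padded_len, dtype=tf.float32)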
1 change: 1 addition & 0 deletions core/data/generation/constants.py
@@ -0,0 +1 @@
INDENT_STRING = ' '
210 changes: 210 additions & 0 deletions core/data/generation/generate.py
@@ -0,0 +1,210 @@
"""Generates Control Flow Programs.

This file was introduced as part of the Exception IPA-GNN effort, for generating
a new dataset suitable for testing the vanilla IPA-GNN and Exception IPA-GNN.
"""

import collections
import dataclasses
import os
import random
from typing import Optional, Sequence, Text, Tuple

from absl import app
from python_graphs import control_flow
import tensorflow as tf
import tqdm

from core.data import codenet_paths
from core.data import process
from core.data.generation import program_generator
from core.data.generation import python_interpreter

TFRECORD_PATH = codenet_paths.RAW_CFP_RAISE_DATA_PATH
TFRECORD_PATH = 'tmp-002.tfrecord'  # Overrides the path above with a local file.
ASSERTION_ERROR_PROB = 0.5
ADD_ASSERTION_ERROR = True

DEFAULT_OPS = ("+=", "-=", "*=")


@dataclasses.dataclass
class ArithmeticIfRepeatsConfig:
"""Config for ArithmeticIfRepeats ProgramGenerator.

Attributes:
base: The base to represent the integers in.
length: The number of statements in the generated programs.
num_digits: The number of digits in the values used by the programs.
max_repeat_statements: The maximum number of repeat statements allowed in
a program.
max_repetitions: The maximum number of repetitions a repeat statement may
specify.
repeat_probability: The probability that a given statement is a repeat
statement, provided a repeat statement is possible at that location.
max_if_statements: The maximum number of if statements allowed in a program.
if_probability: The probability that a given statement is an if statement,
provided an if statement is possible at that location.
ifelse_probability: The probability that a given statement is an if-else
statement, provided an if statement is possible at that location.
max_nesting: The maximum depth of nesting permitted, or None if no limit.
max_block_size: The maximum number of statements permitted in a block.
ops: The ops allowed in the generated programs.
encoder_name: The encoder name to use to encode the generated programs.
mod: The value (if any) to mod the intermediate values of the program by
after each step of execution.
output_mod: The value (if any) to mod the final values of the program by.
"""
base: int
length: int
num_digits: int = 1
max_repeat_statements: Optional[int] = 2
max_repetitions: int = 9
repeat_probability: float = 0.1
max_if_statements: Optional[int] = 2
if_probability: float = 0.2
ifelse_probability: float = 0.2
max_nesting: Optional[int] = None
max_block_size: Optional[int] = 9
ops: Tuple[Text, ...] = DEFAULT_OPS
encoder_name: Text = "simple"
mod: Optional[int] = 10
output_mod: Optional[int] = None



def int64_feature(value):
"""Constructs a tf.train.Feature for the given int64 value list."""
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


def bytes_feature(values):
"""Constructs a tf.train.Feature for the given str value list."""
values = [v.encode('utf-8') for v in values]
return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))


def to_tf_example(source, target, steps):
"""Constructs a tf.train.Example for the source code."""
return tf.train.Example(features=tf.train.Features(feature={
'source': bytes_feature([source]),
'target': bytes_feature([target]),
'steps': int64_feature([steps]),
}))


def decode_fn(record_bytes):
features = {
'source': tf.io.FixedLenFeature([1], dtype=tf.string),
'target': tf.io.FixedLenFeature([1], dtype=tf.string),
'steps': tf.io.FixedLenFeature([1], dtype=tf.int64),
}
return tf.io.parse_single_example(record_bytes, features)
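# Note: each FixedLenFeature above is declared with shape [1], so the parsed
# tensors are length-1 vectors; callers index into them with [0], as read()
# does below.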


def load_dataset(tfrecord_paths):
return tf.data.TFRecordDataset(
tfrecord_paths,
compression_type=None, buffer_size=None, num_parallel_reads=32
).map(decode_fn)


def read():
for example in load_dataset([TFRECORD_PATH]):
source = example['source'].numpy()[0].decode('utf-8')
target = example['target'].numpy()[0].decode('utf-8')
print(source)
print('---')
# if 'raise' in source:
# print(target)


def generate_example_from_python_source(executor, base, python_source, mod, output_mod):
"""Generates an example dict from the given statements."""
cfg = control_flow.get_control_flow_graph(python_source)
python_source_lines = python_source.strip().split("\n")

values = {"v0": 1} # Assume v0 starts at 1.
try:
values = python_interpreter.evaluate_cfg(
executor, cfg, mod=mod,
initial_values=values,
timeout=200)
error_type = "NoError"
except Exception as e: # pylint: disable=broad-except
error_type = type(e).__name__
target_output = values["v0"]

if output_mod is not None:
try:
target_output %= output_mod
except TypeError:
target_output = 1

return {
'human_readable_target_output': str(target_output),
'error_type': error_type
}


def add_assert_error(source, example):
  """Randomly appends a passing or failing assert on v0 to the source."""
  if example['error_type'] == 'RuntimeError':
    return source, example
  is_error = random.choices([0, 1], [1 - ASSERTION_ERROR_PROB, ASSERTION_ERROR_PROB])[0]
  add_val = random.randint(1, 10)
  current_val = int(example['human_readable_target_output'])
  if is_error:
    # Assert a deliberately different value so the program raises.
    source = f"{source}\nassert v0=={abs(current_val + add_val) % 1000}"
    example['error_type'] = "AssertionError"
  else:
    source = f"{source}\nassert v0=={current_val}"
  return source, example
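# Worked example with assumed random draws: if the completed program ends with
# v0 == 42, the no-error branch appends "assert v0==42" (which passes), while
# the error branch with add_val == 7 appends "assert v0==49"; 49 != 42, so the
# program raises and the example is relabeled 'AssertionError'.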


def main(argv: Sequence[str]) -> None:
del argv # Unused.

# if os.path.exists(TFRECORD_PATH):
# return read()
# os.makedirs(TFRECORD_PATH, exist_ok=True)

executor = python_interpreter.ExecExecutor()
counts = collections.Counter()
program_generator_config = ArithmeticIfRepeatsConfig(
base=10,
max_if_statements=5,
length=30,
)
with tf.io.TFRecordWriter(TFRECORD_PATH) as file_writer:
for _ in tqdm.tqdm(range(5000000)):
source = program_generator.generate_python_source(
30, program_generator_config)
# print(source)
# print()

example = (
generate_example_from_python_source(
executor, program_generator_config.base, source,
mod=1000,
output_mod=1000,
)
)
# print(example)

source, example = add_assert_error(source, example)

target = example['human_readable_target_output']
error_type = example['error_type']
lines = source.split('\n')
steps = process.get_step_limit(lines)
counts[target] += 1

if error_type != 'NoError':
target = error_type
record_bytes = to_tf_example(source, target, steps).SerializeToString()
file_writer.write(record_bytes)
print(dict(counts))


if __name__ == '__main__':
app.run(main)
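For a quick sanity check after generation, a minimal usage sketch (path assumed) that inspects a few records using this module's own load_dataset:

for example in load_dataset(['tmp-002.tfrecord']).take(3):
  print(example['source'].numpy()[0].decode('utf-8'))
  print('target:', example['target'].numpy()[0].decode('utf-8'))
  print('steps:', int(example['steps'].numpy()[0]))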