workflowgen.py
import random
import numpy as np
import pprint
import json
from workflowgen.vizaction import VizAction
from workflowgen.linkaction import LinkAction
from optparse import OptionParser
import pandas as pd
from common.schema import Schema
from common.vizgraph import VizGraph
#from common.storage import Storage
import pandasql
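
# Generates a synthetic interaction workflow: a chain of VizAction/LinkAction
# operations applied to a VizGraph, with each resulting filter query validated
# against a sample of the dataset, and the surviving interaction sequence
# written out as JSON.
#
# Illustrative invocation (a sketch, not the only supported form): it assumes
# the script is run from the repository root so the workflowgen and common
# packages resolve, and that data/<dataset>/ already contains sample.csv,
# sample.json, workflowtypes/ and workflows/.
#
#   python workflowgen.py --dataset flights --workflow-type sequential.json \
#       --num-operations 20 --num-samples 10000 --output workflow
#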
class WorkflowGenerator:

    def __init__(self):
        parser = OptionParser()
        parser.add_option("-r", "--seed", dest="seed", action="store", type=int, help="Random seed", default=25000)
        parser.add_option("-d", "--dataset", dest="data_folder", action="store", help="Name of the dataset folder under data/", default="flights")
        parser.add_option("--debug", dest="debug", action="store_true", help="Creates a debug file", default=False)
        parser.add_option("-n", "--num-operations", dest="num_operations", action="store", type=int, help="Number of operations to generate", default=20)
        parser.add_option("-c", "--workflow-type", dest="config", action="store", help="Workflow-type config file (looked up under data/<dataset>/workflowtypes/)", default="sequential.json")
        parser.add_option("-p", "--output", dest="path", action="store", help="Output file name (written to data/<dataset>/workflows/<name>.json)", default="workflow")
        parser.add_option("-s", "--num-samples", dest="numsamples", action="store", type=int, help="Number of samples to draw from the original dataset", default=10000)
        (options, args) = parser.parse_args()
        self.options = options
        random.seed(options.seed)
        np.random.seed(seed=options.seed)
print("data/" + options.data_folder + "/" + options.config)
with open("data/" + options.data_folder + "/workflowtypes/" + options.config, "r") as fp:
self.config = json.load(fp)
schema = None
with open(self.get_schema_path()) as f:
schema = Schema(json.load(f))
print("reading csv...")
# load sample data
df = pd.read_csv("data/" + options.data_folder + "/sample.csv", nrows=options.numsamples, header=0)
#schema = {"tables": [{ "name": "df", "dimensions": []}]}
sample_json = None
with open("data/" + options.data_folder + "/sample.json", "r") as f:
sample_json = json.load(f)
# print(sample_json)
# for field in sample_json["tables"]["fact"]["fields"]:
# schema["tables"][0]["dimensions"].append({"name": field["field"]})
#storage = Storage(schema)
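
        # Retry loop: regenerate the whole workflow, bumping the random seed by
        # one each attempt, until at most 15% of the validation queries issued
        # against the sample return an empty result.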
        zero_qs_ratio = 100
        tries = -1
        while zero_qs_ratio > 0.15:
            tries += 1
            num_zeros_qs = 0
            num_qs = 0
            VizAction.VIZ_COUNTER = -1
            LinkAction.FIRST_LINK = None
            LinkAction.LATEST_LINK = None
            LinkAction.LINKS = set()
            vizgraph = VizGraph()
            random.seed(options.seed + tries)
            root = VizAction(self.config, df, vizgraph, schema, sample_json)
            current = root
            states = []
            num_ops = 0
            debug_states = []
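
            # Walk the action chain: apply each interaction to the viz graph,
            # run the computed filter of every affected visualization against
            # the sample data frame via pandasql, and count empty results.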
            while num_ops < options.num_operations:
                res = current.get_states()
                if res:
                    affected_vizs = vizgraph.apply_interaction(res)
                    if options.debug:
                        nodes_dict = vizgraph.get_nodes_dict()
                        states_dict = {}
                        for n in nodes_dict.keys():
                            states_dict[n] = {
                                "name": n,
                                "source": nodes_dict[n].get_source(),
                                "binning": nodes_dict[n].binning,
                                "agg": nodes_dict[n].per_bin_aggregates,
                                "selection": nodes_dict[n].get_selection(),
                                "filter": nodes_dict[n].get_filter(),
                                "computed_filter": nodes_dict[n].get_computed_filter_as_sql(schema),
                            }
                        debug_states.append(states_dict)
                    for x in affected_vizs:
                        sql = x.get_computed_filter_as_sql(schema).replace("FLOOR", "ROUND").replace(schema.get_fact_table_name(), "df")
                        r = pandasql.sqldf(sql, locals())
                        num_qs += 1
                        if len(r.index) == 0:
                            num_zeros_qs += 1
                            #print("ZERO QUERY")
                    states.append(res.data)
                    #print(res.data)
                    #if "source" not in res:
                    num_ops += 1
                current = current.get_next()
                if current is None:
                    zero_qs_ratio = num_zeros_qs / num_qs
                    break
            zero_qs_ratio = num_zeros_qs / num_qs
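
        # Report the final zero-query ratio and write the accepted workflow
        # (generation metadata plus the interaction list) to
        # data/<dataset>/workflows/<output>.json.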
print("zero queries:")
print( (num_zeros_qs / num_qs))
with open("data/" + options.data_folder + "/workflows/" + options.path + ".json", "w") as fp:
fp.write(json.dumps({"name": "generated", "dataset": options.data_folder, "seed": options.seed, "config": options.config, "interactions": states}))
print("done.")
#with open("workflowviewer/public/workflow.json", "w") as fp:
# fp.write(json.dumps({"name": "generated", "dataset": options.data_folder, "seed": options.seed, "config": options.config, "interactions": states}))
#with open("workflowviewer/public/workflow_debug.json", "w") as fp:
# fp.write(json.dumps(debug_states))
#if options.debug:
# import webbrowser
# url = "http://localhost:3000"
# webbrowser.open(url)
    def get_schema_path(self):
        return "data/%s/sample.json" % (self.options.data_folder)

    def get_viz_name(self):
        return "viz_%i" % self.config["viz_counter"]


if __name__ == "__main__":
    WorkflowGenerator()