Commit 1c1dd67

Sort processing order & parse json config file (#13)

Sort the processing order of users/versions (helps debugging). Moved from
command line arguments for each script to parsing the config json.

1 parent 4c25f0c commit 1c1dd67
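
All five programs now take a single config_path argument in place of the old
per-script flags. The keys read across this diff are semester, course,
gradeable, language, sequence_length, and threshold. As a hypothetical sketch
only (the key names appear in the diff, but the values and the exact file
layout do not), such a config could be produced like this:

# Hypothetical sketch: key names are from this diff, values are made up.
import json

lichen_config = {
    "semester": "f17",
    "course": "sample_course",
    "gradeable": "hw01",
    "language": "plaintext",    # the scripts accept "plaintext", "python", or "cpp"
    "sequence_length": "10",    # kept as strings: the scripts call int() / std::stoi()
    "threshold": "20",
}

with open("lichen_config.json", "w") as f:
    json.dump(lichen_config, f, indent=4)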

File tree

5 files changed: +108 −72 lines

bin/concatenate_all.py
bin/hash_all.py
bin/tokenize_all.py
compare_hashes/compare_hashes.cpp
tokenizer/plaintext/plaintext_tokenizer.cpp
bin/concatenate_all.py

Lines changed: 18 additions & 13 deletions

@@ -18,9 +18,7 @@
 
 def parse_args():
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("semester")
-    parser.add_argument("course")
-    parser.add_argument("gradeable")
+    parser.add_argument("config_path")
     return parser.parse_args()
 
 
@@ -30,29 +28,35 @@ def main():
     sys.stdout.write("CONCATENATE ALL...")
     sys.stdout.flush()
 
+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        semester = lichen_config_data["semester"]
+        course = lichen_config_data["course"]
+        gradeable = lichen_config_data["gradeable"]
+
     # ===========================================================================
     # error checking
-    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",args.semester,args.course)
+    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",semester,course)
     if not os.path.isdir(course_dir):
         print("ERROR! ",course_dir," is not a valid course directory")
         exit(1)
-    submission_dir=os.path.join(course_dir,"submissions",args.gradeable)
+    submission_dir=os.path.join(course_dir,"submissions",gradeable)
     if not os.path.isdir(submission_dir):
         print("ERROR! ",submission_dir," is not a valid gradeable submissions directory")
         exit(1)
 
     # ===========================================================================
     # create the directory
-    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",args.gradeable)
+    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",gradeable)
     if not os.path.isdir(concatenated_dir):
         os.makedirs(concatenated_dir)
 
     # ===========================================================================
     # walk the subdirectories
-    for user in os.listdir(submission_dir):
+    for user in sorted(os.listdir(submission_dir)):
         if not os.path.isdir(os.path.join(submission_dir,user)):
             continue
-        for version in os.listdir(os.path.join(submission_dir,user)):
+        for version in sorted(os.listdir(os.path.join(submission_dir,user))):
             if not os.path.isdir(os.path.join(submission_dir,user,version)):
                 continue
 
@@ -64,9 +68,9 @@ def main():
             my_concatenated_file=os.path.join(my_concatenated_dir,"submission.concatenated")
             with open(my_concatenated_file,'w') as my_cf:
                 # print a brief header of information
-                my_cf.write("SEMESTER: "+args.semester+"\n")
-                my_cf.write("COURSE: "+args.course+"\n")
-                my_cf.write("GRADEABLE: "+args.gradeable+"\n")
+                my_cf.write("SEMESTER: "+semester+"\n")
+                my_cf.write("COURSE: "+course+"\n")
+                my_cf.write("GRADEABLE: "+gradeable+"\n")
                 my_cf.write("USER: "+user+"\n")
                 my_cf.write("VERSION: "+version+"\n")
                 # loop over all files in all subdirectories
@@ -82,9 +86,10 @@ def main():
                     # print a separator & filename
                     my_cf.write("----------------------------------------------------\n")
                     my_cf.write("FILE: "+relative_path+"\n\n")
-                    with open(absolute_path) as tmp:
+                    with open(absolute_path, encoding='ISO-8859-1') as tmp:
                         # append the contents of the file
-                        my_cf.write(tmp.read()+"\n")
+                        my_cf.write(tmp.read())
+                        my_cf.write("\n")
 
     print ("done")
 
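
os.listdir() returns entries in an arbitrary, filesystem-dependent order, so
repeated runs could visit users and versions differently; wrapping each listing
in sorted() is what makes the walk deterministic, which is what the commit
message means by "helps debugging". A minimal sketch of the pattern:

import os

# sorted() gives a deterministic (lexicographic) walk; plain os.listdir()
# order can differ between runs and between machines.
def walk_submissions(submission_dir):
    for user in sorted(os.listdir(submission_dir)):
        user_dir = os.path.join(submission_dir, user)
        if not os.path.isdir(user_dir):
            continue
        for version in sorted(os.listdir(user_dir)):
            yield user, version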

bin/hash_all.py

Lines changed: 29 additions & 29 deletions

@@ -23,42 +23,37 @@
 
 def parse_args():
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("semester")
-    parser.add_argument("course")
-    parser.add_argument("gradeable")
-    parser.add_argument("--window",type=int,default=10)
-    parser.add_argument("--hash_size",type=int,default=100000)
-    language = parser.add_mutually_exclusive_group(required=True)
-    language.add_argument ("--plaintext", action='store_true')
-    language.add_argument ("--python", action='store_true')
-    language.add_argument ("--cpp", action='store_true')
-
+    parser.add_argument("config_path")
     args = parser.parse_args()
-
-    if (args.window < 1):
-        print ("ERROR! window must be >= 1")
-        exit(1)
-
     return args
 
 
 def hasher(args,my_tokenized_file,my_hashes_file):
-    with open(my_tokenized_file,'r') as my_tf:
+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        language = lichen_config_data["language"]
+        sequence_length = int(lichen_config_data["sequence_length"])
+
+    if (sequence_length < 1):
+        print ("ERROR! sequence_length must be >= 1")
+        exit(1)
+
+    with open(my_tokenized_file,'r',encoding='ISO-8859-1') as my_tf:
        with open(my_hashes_file,'w') as my_hf:
            tokens = json.load(my_tf)
            num = len(tokens)
-            for i in range(0,num-args.window):
+            for i in range(0,num-sequence_length):
                foo=""
-                if args.plaintext:
-                    for j in range(0,args.window):
+                if language == "plaintext":
+                    for j in range(0,sequence_length):
                        foo+=str(tokens[i+j].get("value"))
 
-                elif args.python:
-                    for j in range(0,args.window):
+                elif language == "python":
+                    for j in range(0,sequence_length):
                        foo+=str(tokens[i+j].get("type"))
 
-                elif args.cpp:
-                    for j in range(0,args.window):
+                elif language == "cpp":
+                    for j in range(0,sequence_length):
                        foo+=str(tokens[i+j].get("type"))
 
                else:
@@ -77,26 +72,32 @@ def hasher(args,my_tokenized_file,my_hashes_file):
 def main():
     args = parse_args()
 
+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        semester = lichen_config_data["semester"]
+        course = lichen_config_data["course"]
+        gradeable = lichen_config_data["gradeable"]
+
     sys.stdout.write("HASH ALL...")
     sys.stdout.flush()
 
     # ===========================================================================
     # error checking
-    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",args.semester,args.course)
+    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",semester,course)
     if not os.path.isdir(course_dir):
         print("ERROR! ",course_dir," is not a valid course directory")
         exit(1)
-    tokenized_dir=os.path.join(course_dir,"lichen","tokenized",args.gradeable)
+    tokenized_dir=os.path.join(course_dir,"lichen","tokenized",gradeable)
     if not os.path.isdir(tokenized_dir):
         print("ERROR! ",tokenized_dir," is not a valid gradeable tokenized directory")
         exit(1)
 
-    hashes_dir=os.path.join(course_dir,"lichen","hashes",args.gradeable)
+    hashes_dir=os.path.join(course_dir,"lichen","hashes",gradeable)
 
     # ===========================================================================
     # walk the subdirectories
-    for user in os.listdir(tokenized_dir):
-        for version in os.listdir(os.path.join(tokenized_dir,user)):
+    for user in sorted(os.listdir(tokenized_dir)):
+        for version in sorted(os.listdir(os.path.join(tokenized_dir,user))):
            my_tokenized_file=os.path.join(tokenized_dir,user,version,"tokens.json")
 
            # ===========================================================================
@@ -108,7 +109,6 @@ def main():
            my_hashes_file=os.path.join(my_hashes_dir,"hashes.txt")
            hasher(args,my_tokenized_file,my_hashes_file)
 
-
    print("done")
 
 if __name__ == "__main__":
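
In hasher(), every run of sequence_length consecutive tokens is flattened into
one string (token values for plaintext, token types for python/cpp) before it
is hashed. A minimal sketch of that windowing, with a stand-in hash because the
actual hash function is outside this diff:

import hashlib

def token_windows(tokens, sequence_length, key):
    # key is "value" for plaintext and "type" for python/cpp,
    # matching the branches in hasher()
    for i in range(0, len(tokens) - sequence_length):
        foo = ""
        for j in range(0, sequence_length):
            foo += str(tokens[i + j].get(key))
        # md5 here is only a stand-in for the script's real hashing
        yield hashlib.md5(foo.encode()).hexdigest()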

bin/tokenize_all.py

Lines changed: 19 additions & 17 deletions

@@ -19,32 +19,29 @@
 
 def parse_args():
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("semester")
-    parser.add_argument("course")
-    parser.add_argument("gradeable")
-    language = parser.add_mutually_exclusive_group(required=True)
-    language.add_argument ("--plaintext", action='store_true')
-    language.add_argument ("--python", action='store_true')
-    language.add_argument ("--cpp", action='store_true')
+    parser.add_argument("config_path")
     return parser.parse_args()
 
-
 def tokenize(args,my_concatenated_file,my_tokenized_file):
 
-    if args.plaintext:
+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        language = lichen_config_data["language"]
+
+    if language == "plaintext":
        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","plaintext_tokenizer.out")
        with open(my_concatenated_file,'r') as infile:
            with open (my_tokenized_file,'w') as outfile:
                subprocess.call([tokenizer,"--ignore_newlines"],stdin=infile,stdout=outfile)
 
-    elif args.python:
+    elif language == "python":
        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","python_tokenizer.py")
        with open(my_concatenated_file,'r') as infile:
            with open (my_tokenized_file,'w') as outfile:
                command="python3 "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
                os.system(command)
 
-    elif args.cpp:
+    elif language == "cpp":
        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","c_tokenizer.py")
        with open(my_concatenated_file,'r') as infile:
            with open (my_tokenized_file,'w') as outfile:
@@ -62,31 +59,36 @@ def main():
     sys.stdout.write("TOKENIZE ALL...")
     sys.stdout.flush()
 
+    with open(args.config_path) as lichen_config:
+        lichen_config_data = json.load(lichen_config)
+        semester = lichen_config_data["semester"]
+        course = lichen_config_data["course"]
+        gradeable = lichen_config_data["gradeable"]
+
     # ===========================================================================
     # error checking
-    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",args.semester,args.course)
+    course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",semester,course)
     if not os.path.isdir(course_dir):
         print("ERROR! ",course_dir," is not a valid course directory")
         exit(1)
-    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",args.gradeable)
+    concatenated_dir=os.path.join(course_dir,"lichen","concatenated",gradeable)
     if not os.path.isdir(concatenated_dir):
         print("ERROR! ",concatenated_dir," is not a valid gradeable concatenated directory")
         exit(1)
 
-    tokenized_dir=os.path.join(course_dir,"lichen","tokenized",args.gradeable)
+    tokenized_dir=os.path.join(course_dir,"lichen","tokenized",gradeable)
 
     # ===========================================================================
     # walk the subdirectories
-    for user in os.listdir(concatenated_dir):
-        for version in os.listdir(os.path.join(concatenated_dir,user)):
+    for user in sorted(os.listdir(concatenated_dir)):
+        for version in sorted(os.listdir(os.path.join(concatenated_dir,user))):
            my_concatenated_file=os.path.join(concatenated_dir,user,version,"submission.concatenated")
 
            # ===========================================================================
            # create the directory
            my_tokenized_dir=os.path.join(tokenized_dir,user,version)
            if not os.path.isdir(my_tokenized_dir):
                os.makedirs(my_tokenized_dir)
-
            my_tokenized_file=os.path.join(my_tokenized_dir,"tokens.json")
            tokenize(args,my_concatenated_file,my_tokenized_file)
 
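
tokenize() dispatches on the config's language string through chained elif
branches. As a sketch of an alternative, table-driven way to express the same
dispatch (the three tokenizer filenames are taken from this diff; the structure
is not the commit's code):

import os

TOKENIZERS = {
    "plaintext": "plaintext_tokenizer.out",
    "python": "python_tokenizer.py",
    "cpp": "c_tokenizer.py",
}

def tokenizer_path(install_dir, language):
    # raises KeyError for a language value the scripts don't support
    return os.path.join(install_dir, "Lichen", "bin", TOKENIZERS[language])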

compare_hashes/compare_hashes.cpp

Lines changed: 27 additions & 9 deletions

@@ -125,13 +125,23 @@ int main(int argc, char* argv[]) {
 
   // ---------------------------------------------------------------------------
   // deal with command line arguments
-  assert (argc == 6);
-  std::string semester = argv[1];
-  std::string course = argv[2];
-  std::string gradeable = argv[3];
-  assert (argv[4] == std::string("--window"));
-  int window = std::stoi(std::string(argv[5]));
-  assert (window >= 1);
+  assert (argc == 2);
+  std::string config_file = argv[1];
+
+  std::ifstream istr(config_file.c_str());
+  assert (istr.good());
+  nlohmann::json config_file_json = nlohmann::json::parse(istr);
+
+  std::string semester = config_file_json.value("semester","ERROR");
+  std::string course = config_file_json.value("course","ERROR");
+  std::string gradeable = config_file_json.value("gradeable","ERROR");
+  std::string sequence_length_str = config_file_json.value("sequence_length","1");
+  int sequence_length = std::stoi(sequence_length_str);
+  std::string threshold_str = config_file_json.value("threshold","5");
+  int threshold = std::stoi(threshold_str);
+
+  assert (sequence_length >= 1);
+  assert (threshold >= 2);
 
   // error checking, confirm there are hashes to work with
   std::string tmp = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/hashes/"+gradeable;
@@ -180,6 +190,8 @@ int main(int argc, char* argv[]) {
     }
   }
 
+  std::cout << "finished loading" << std::endl;
+
   // ---------------------------------------------------------------------------
 
   // label the parts of the file that are common to many
@@ -194,21 +206,26 @@ int main(int argc, char* argv[]) {
   // user,version -> ( position -> ( other user,version -> std::vector<Sequence> ) )
   std::map<Submission,std::map<int,std::map<Submission, std::vector<Sequence> > > > suspicious;
 
+  int my_counter = 0;
 
   // ---------------------------------------------------------------------------
   // walk over the structure containing all of the hashes identifying
   // common to many/all, provided code, suspicious matches, and unique code
   for (hashed_sequences::iterator itr = hash_counts.begin(); itr != hash_counts.end(); itr++) {
     int count = itr->second.size();
 
-    if (count >= 20) {
+    my_counter++;
+
+    std::cout << "hash walk " << hash_counts.size() << " " << my_counter << std::endl;
+
+    if (count > threshold) {
       // common to many/all
       for (std::map<std::string,std::vector<Sequence> >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) {
         for (int i = 0; i < itr2->second.size(); i++) {
           common[itr2->second[i].submission].insert(itr2->second[i].position);
         }
       }
-    } else if (count > 1 && count < 20) {
+    } else if (count > 1 && count <= threshold) {
       // suspicious matches
       for (std::map<std::string,std::vector<Sequence> >::iterator itr2 = itr->second.begin(); itr2 != itr->second.end(); itr2++) {
         std::string username = itr2->first;
@@ -234,6 +251,7 @@ int main(int argc, char* argv[]) {
     }
   }
 
+  std::cout << "finished walking" << std::endl;
 
   // ---------------------------------------------------------------------------
   // prepare a sorted list of all users sorted by match percent
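
The configurable threshold replaces the hard-coded cutoff of 20: a hash shared
by more than threshold submissions is treated as common/boilerplate code, one
shared by two to threshold submissions as a suspicious match, and a unique hash
is left alone. The branch structure, sketched in Python for brevity:

def classify(count, threshold):
    # mirrors the if/else chain in compare_hashes.cpp after this change
    if count > threshold:
        return "common"       # shared by many: likely provided/boilerplate code
    elif count > 1:           # i.e. 1 < count <= threshold
        return "suspicious"   # shared by a few submissions
    else:
        return "unique"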

tokenizer/plaintext/plaintext_tokenizer.cpp

Lines changed: 15 additions & 4 deletions

@@ -13,6 +13,19 @@ void usage(const std::string &program) {
 }
 
 
+void deal_with_number(std::map<std::string,nlohmann::json>& tmp, const std::string& token) {
+  try {
+    // normal case, convert to integer
+    tmp["type"]="number";
+    tmp["value"]=std::stoi(token);
+  } catch (...) {
+    // if conversion fails (integer too big!)
+    tmp["type"]="string";
+    tmp["value"]=token;
+  }
+}
+
+
 int main(int argc, char* argv[]) {
 
   // ------------------------------
@@ -72,8 +85,7 @@ int main(int argc, char* argv[]) {
       tmp["char"]=start_col;
       if (last_was_digit) {
         assert (!last_was_alpha);
-        tmp["type"]="number";
-        tmp["value"]=std::stoi(token);
+        deal_with_number(tmp,token);
       } else {
         assert (last_was_alpha);
         tmp["type"]="string";
@@ -171,8 +183,7 @@ int main(int argc, char* argv[]) {
       tmp["char"]=start_col;
       if (last_was_digit) {
         assert (!last_was_alpha);
-        tmp["type"]="number";
-        tmp["value"]=std::stoi(token);
+        deal_with_number(tmp,token);
       } else {
         assert (last_was_alpha);
         tmp["type"]="string";
