Commit 65575a4
[Feature:Plagiarism] Improve run log readability (#74)
* Only print warning once
* Add number of times it was truncated
* Add progress bars for most of pipeline
* Add compare_hashes progress bar

1 parent 791258f · commit 65575a4

File tree: 4 files changed (+92, -35 lines)

bin/concatenate_all.py

Lines changed: 5 additions & 6 deletions

@@ -8,7 +8,7 @@
 import os
 import sys
 import json
-import time
+import datetime
 import humanize
 import fnmatch
 import hashlib
@@ -186,10 +186,10 @@ def validate(config, args):
 
 
 def main():
-    start_time = time.time()
+    start_time = datetime.datetime.now()
     args = parse_args()
 
-    print("CONCATENATE ALL...", end="")
+    print("CONCATENATE ALL:", flush=True)
 
     config_path = os.path.join(args.basepath, "config.json")
     if not os.path.isfile(config_path):
@@ -291,9 +291,8 @@ def main():
     checkTotalSize(total_concat)
 
     # ==========================================================================
-    end_time = time.time()
-    print("done in " + "%.0f" % (end_time - start_time) + " seconds,",
-          humanize.naturalsize(total_concat) + " concatenated")
+    print("Concatenation done in", humanize.precisedelta(start_time, format="%1.f") + ",",
+          humanize.naturalsize(total_concat), "concatenated")
 
 
 if __name__ == "__main__":
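The timing rewrite above replaces time.time() arithmetic with humanize.precisedelta(), which accepts the start datetime and measures the delta against the current time. A minimal standalone sketch of the pattern, assuming humanize 3.x (where precisedelta accepts a datetime); the sleep stands in for the real concatenation work:

import datetime
import time

import humanize

start_time = datetime.datetime.now()
time.sleep(2)  # stand-in for the work being timed
# precisedelta(datetime) reports the time between start_time and now;
# format="%1.f" drops fractional digits, so this prints e.g.
# "Concatenation done in 2 seconds"
print("Concatenation done in", humanize.precisedelta(start_time, format="%1.f"))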

bin/hash_all.py

Lines changed: 33 additions & 12 deletions

@@ -8,9 +8,10 @@
 import argparse
 import os
 import json
-import time
 import hashlib
 from pathlib import Path
+import humanize
+import datetime
 
 
 def parse_args():
@@ -48,7 +49,7 @@ def hasher(lichen_config, lichen_run_config, my_tokenized_file, my_hashes_file):
 
 
 def main():
-    start_time = time.time()
+    start_time = datetime.datetime.now()
     args = parse_args()
 
     with open(Path(args.basepath, "config.json")) as lichen_run_config_file:
@@ -57,13 +58,26 @@ def main():
     with open(Path(__file__).resolve().parent / "lichen_config.json") as lichen_config_file:
         lichen_config = json.load(lichen_config_file)
 
-    print("HASH ALL...", end="")
+    print("HASH ALL:", flush="True")
+    print("[0%                      25%                      50%                      75%                     100%]\n[", end="", flush=True)  # noqa: E501
+
+    users_dir = os.path.join(args.basepath, "users")
+    if not os.path.isdir(users_dir):
+        raise SystemExit("ERROR: Unable to find users directory")
+
+    other_gradeables_dir = os.path.join(args.basepath, "other_gradeables")
+    if not os.path.isdir(other_gradeables_dir):
+        raise SystemExit("ERROR: Unable to find other gradeables directory")
+
+    total_users = len(os.listdir(users_dir))
+    for dir in os.listdir(other_gradeables_dir):
+        total_users += len(os.listdir(os.path.join(other_gradeables_dir, dir)))
+
+    users_hashed = 0
+    percent_progress = 0
 
     # ==========================================================================
     # walk the subdirectories of this gradeable
-    users_dir = Path(args.basepath, "users")
-    if not os.path.isdir(users_dir):
-        raise SystemExit("ERROR! Unable to find users directory")
 
     for user in sorted(os.listdir(users_dir)):
         user_dir = Path(users_dir, user)
@@ -79,13 +93,15 @@ def main():
             my_hashes_file = Path(my_dir, "hashes.txt")
             hasher(lichen_config, lichen_run_config, my_tokenized_file, my_hashes_file)
 
+            users_hashed += 1
+            if int((users_hashed / total_users) * 100) > percent_progress:
+                new_percent_progress = int((users_hashed / total_users) * 100)
+                print("|" * (new_percent_progress - percent_progress), end="", flush=True)
+                percent_progress = new_percent_progress
+
     # ==========================================================================
     # walk the subdirectories of the other gradeables
 
-    other_gradeables_dir = Path(args.basepath, "other_gradeables")
-    if not os.path.isdir(other_gradeables_dir):
-        raise SystemExit("ERROR! Unable to find other gradeables directory")
-
     for other_gradeable in sorted(os.listdir(other_gradeables_dir)):
         other_gradeable_dir = Path(other_gradeables_dir, other_gradeable)
         if not os.path.isdir(other_gradeable_dir):
@@ -105,15 +121,20 @@ def main():
             other_hashes_file = Path(other_version_dir, "hashes.txt")
             hasher(lichen_config, lichen_run_config, other_tokenized_file, other_hashes_file)
 
+            users_hashed += 1
+            if int((users_hashed / total_users) * 100) > percent_progress:
+                new_percent_progress = int((users_hashed / total_users) * 100)
+                print("|" * (new_percent_progress - percent_progress), end="", flush=True)
+                percent_progress = new_percent_progress
+
     # ==========================================================================
     # hash the provided code
     provided_code_tokenized = Path(args.basepath, "provided_code", "tokens.json")
     provided_code_hashed = Path(args.basepath, "provided_code", "hashes.txt")
     hasher(lichen_config, lichen_run_config, provided_code_tokenized, provided_code_hashed)
 
     # ==========================================================================
-    end_time = time.time()
-    print("done in " + "%.0f" % (end_time - start_time) + " seconds")
+    print("]\nHashing done in", humanize.precisedelta(start_time, format="%1.f"))
 
 
 if __name__ == "__main__":
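The bar itself is plain text: the script counts the total work units up front, then prints one "|" per percentage point whenever the integer percentage crosses a new threshold, so a finished run emits exactly 100 marks between "[" and "]". A standalone sketch of the same pattern (the item count and sleep are placeholders for the real hashing loop):

import time

total = 250                  # stand-in for total_users computed above
print("[0%  ...  100%]")     # header abridged; the real one spans 100 columns
print("[", end="", flush=True)

percent_progress = 0
for done in range(1, total + 1):
    time.sleep(0.001)        # stand-in for hasher()
    new_percent = int((done / total) * 100)
    if new_percent > percent_progress:
        # one "|" per percentage point gained since the last update
        print("|" * (new_percent - percent_progress), end="", flush=True)
        percent_progress = new_percent
print("]")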

bin/tokenize_all.py

Lines changed: 35 additions & 11 deletions

@@ -6,7 +6,8 @@
 import argparse
 import os
 import json
-import time
+import humanize
+import datetime
 
 
 def parse_args():
@@ -43,19 +44,34 @@ def tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file):
 
 
 def main():
-    start_time = time.time()
+    start_time = datetime.datetime.now()
     args = parse_args()
 
-    print("TOKENIZE ALL...", end="", flush=True)
+    print("TOKENIZE ALL:", flush=True)
+    print("[0%                      25%                      50%                      75%                     100%]\n[", end="", flush=True)  # noqa: E501
 
     with open(os.path.join(args.basepath, "config.json")) as lichen_config:
         lichen_config_data = json.load(lichen_config)
 
-    # ===========================================================================
-    # walk the subdirectories to tokenize this gradeable's submissions
     users_dir = os.path.join(args.basepath, "users")
     if not os.path.isdir(users_dir):
-        raise SystemExit("ERROR! Unable to find users directory")
+        raise SystemExit("ERROR: Unable to find users directory")
+
+    other_gradeables_dir = os.path.join(args.basepath, "other_gradeables")
+    if not os.path.isdir(other_gradeables_dir):
+        raise SystemExit("ERROR: Unable to find other gradeables directory")
+
+    # We'll make a rough estimate of the percentage of tokenization done by
+    # taking the percentage of users which have been tokenized thus far
+    total_users = len(os.listdir(users_dir))
+    for dir in os.listdir(other_gradeables_dir):
+        total_users += len(os.listdir(os.path.join(other_gradeables_dir, dir)))
+
+    users_tokenized = 0
+    percent_progress = 0
+
+    # ===========================================================================
+    # walk the subdirectories to tokenize this gradeable's submissions
 
     for user in sorted(os.listdir(users_dir)):
         user_dir = os.path.join(users_dir, user)
@@ -71,11 +87,14 @@ def main():
             my_tokenized_file = os.path.join(my_dir, "tokens.json")
             tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file)
 
+            users_tokenized += 1
+            if int((users_tokenized / total_users) * 100) > percent_progress:
+                new_percent_progress = int((users_tokenized / total_users) * 100)
+                print("|" * (new_percent_progress - percent_progress), end="", flush=True)
+                percent_progress = new_percent_progress
+
     # ===========================================================================
     # tokenize the other other gradeables' submissions
-    other_gradeables_dir = os.path.join(args.basepath, "other_gradeables")
-    if not os.path.isdir(other_gradeables_dir):
-        raise SystemExit("ERROR! Unable to find other gradeables directory")
 
     for other_gradeable in sorted(os.listdir(other_gradeables_dir)):
         other_gradeable_dir = os.path.join(other_gradeables_dir, other_gradeable)
@@ -96,15 +115,20 @@ def main():
             other_tokenized_file = os.path.join(other_version_dir, "tokens.json")
             tokenize(lichen_config_data, other_concatenated_file, other_tokenized_file)
 
+            users_tokenized += 1
+            if int((users_tokenized / total_users) * 100) > percent_progress:
+                new_percent_progress = int((users_tokenized / total_users) * 100)
+                print("|" * (new_percent_progress - percent_progress), end="", flush=True)
+                percent_progress = new_percent_progress
+
     # ===========================================================================
     # tokenize the provided code
     provided_code_concat = os.path.join(args.basepath, "provided_code", "submission.concatenated")
     provided_code_tokenized = os.path.join(args.basepath, "provided_code", "tokens.json")
    tokenize(lichen_config_data, provided_code_concat, provided_code_tokenized)
 
     # ==========================================================================
-    end_time = time.time()
-    print("done in " + "%.0f" % (end_time - start_time) + " seconds")
+    print("]\nTokenization done in", humanize.precisedelta(start_time, format="%1.f"))
 
 
 if __name__ == "__main__":
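As the comment in the diff says, the percentage is a rough estimate: a unit of work is one user directory, summed over this gradeable's users/ tree and every gradeable under other_gradeables/. A hypothetical sketch of just that count (basepath and the directory layout are placeholders):

import os

basepath = "/tmp/lichen_run"  # hypothetical run directory
users_dir = os.path.join(basepath, "users")
other_gradeables_dir = os.path.join(basepath, "other_gradeables")

# every user in this gradeable counts as one unit of work...
total_users = len(os.listdir(users_dir))
# ...plus every user directory inside each other gradeable
for gradeable in os.listdir(other_gradeables_dir):
    total_users += len(os.listdir(os.path.join(other_gradeables_dir, gradeable)))

print(f"the progress bar will track {total_users} units of work")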

compare_hashes/compare_hashes.cpp

Lines changed: 19 additions & 6 deletions

@@ -99,7 +99,7 @@ bool ranking_sorter(const StudentRanking &a, const StudentRanking &b) {
 // MAIN
 
 int main(int argc, char* argv[]) {
-  std::cout << "COMPARE HASHES...";
+  std::cout << "COMPARE HASHES:" << std::endl;
   fflush(stdout);
   time_t overall_start, overall_end;
   time(&overall_start);
@@ -253,7 +253,7 @@ int main(int argc, char* argv[]) {
 
   time(&end);
   double diff = difftime(end, start);
-  std::cout << "finished loading in " << diff << " seconds" << std::endl;
+  std::cout << "Finished loading in " << diff << " seconds" << std::endl;
 
 
   // ===========================================================================
@@ -264,6 +264,9 @@ int main(int argc, char* argv[]) {
   int my_percent = 0;
   time(&start);
 
+  std::cout << "[0%                      25%                      50%                      75%                     100%]" << std::endl << "[";
+  fflush(stdout);
+
   // walk over every Submission
   for (std::vector<Submission*>::iterator submission_itr = all_submissions.begin();
        submission_itr != all_submissions.end(); ++submission_itr) {
@@ -588,14 +591,23 @@ int main(int argc, char* argv[]) {
     // print current progress
     my_counter++;
     if (int((my_counter / float(all_submissions.size())) * 100) > my_percent) {
-      my_percent = int((my_counter / float(all_submissions.size())) * 100);
-      std::cout << "Processing submissions: " << my_percent << "% complete" << std::endl;
+      int new_my_percent = int((my_counter / float(all_submissions.size())) * 100);
+      for (int i=0; i < new_my_percent - my_percent; i++) {
+        std::cout << "|";
+      }
+      fflush(stdout);
+      my_percent = new_my_percent;
     }
   }
 
+  // Finish printing any remaining portion of the progress bar
+  for (int i=0; i < 100 - my_percent; i++) {
+    std::cout << "|";
+  }
+
   time(&end);
   diff = difftime(end, start);
-  std::cout << "Finished processing submissions in " << diff << " seconds" << std::endl;
+  std::cout << "]" << std::endl << "Finished processing submissions in " << diff << " seconds" << std::endl;
 
   // Print out the list of users who had their matching positions array truncated
   if (matching_positions_truncations.size() > 0) {
@@ -606,6 +618,7 @@ int main(int argc, char* argv[]) {
     }
     std::cout << std::endl << " - Try increasing the hash size or adding a regex to fix this problem." << std::endl;
   }
+  fflush(stdout);
 
   // ===========================================================================
   // Create a general summary of rankings of users by percentage match
@@ -645,5 +658,5 @@ int main(int argc, char* argv[]) {
 
   time(&overall_end);
   double overall_diff = difftime(overall_end, overall_start);
-  std::cout << "COMPARE HASHES done in " << overall_diff << " seconds" << std::endl;
+  std::cout << "Hash comparison done in " << overall_diff << " seconds" << std::endl;
 }
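One subtlety in the C++ above: the in-loop update only prints marks when the integer percentage strictly increases, so rounding can leave the bar a few marks short when the loop ends; the trailing for loop pads it to exactly 100 before "]" is printed. The same guard rendered in Python, for consistency with the rest of this page (the starting value is illustrative):

my_percent = 97  # illustrative: where rounding left the bar when the loop ended
# pad the bar to exactly 100 marks before closing it
print("|" * (100 - my_percent), end="")
print("]")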
