diff --git a/data/README.md b/data/README.md
new file mode 100644
index 0000000..e5b431a
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1,26 @@
+# Example Usage
+
+    head -n20 desktop_heatmap_2012.json | python eventify.py > events.txt
+    python faststat.py ` grep send context-menu.txt | cut -d, -f3 | sort | uniq | xargs` events.txt
+
+
+# Main ideas / design goals:
+
+* filter for 'stream of events'
+* multiple input lines might be 'one event' here.
+* transform for 'simple stats'
+* more generators
+* separate the 'generate stream of events' from the 'map this to
+heatmap'
+* events should be 'logical' / semantic
+* one event per line, so there is a hope in heaven of using grep on them
+* 'terminal events'? Is this a good idea or not?
+  - it means that users with no events show up
+  - easy to grep for
+  - but a little gross (see: null strings in C)
+* 'more attributes' should be 'extra' (like type of click? num
+windows?). Specifically, events should be able to be rolled up several
+ways?
+* events in the output should be 'independent'?
+* should we call out 'inferred events' (like shutdown, switch window?)
+
diff --git a/data/eventify.py b/data/eventify.py
new file mode 100644
index 0000000..96e96b2
--- /dev/null
+++ b/data/eventify.py
@@ -0,0 +1,171 @@
+import simplejson
+import csv
+from itertools import chain
+import fileinput
+#import random
+import hashlib
+import time
+
+def tryparse(json):
+    """ parse a JSON string; returns None if it will not parse """
+    try:
+        return simplejson.loads(json)
+    except (ValueError, TypeError):
+        # bad record: best effort, the caller drops the Nones
+        return None
+
+def parse_input_file(fh):
+    """ returns a list of the JSON payloads parsed from the lines of fh.
+
+    Lines are tab-separated with the JSON in the second field.  Lines with
+    no second field, unparseable JSON or a falsy payload are skipped.
+    """
+    parsed = []
+    for l in fh:
+        fields = l.strip().split('\t')
+        if len(fields) < 2:
+            continue # no payload column; this used to raise IndexError
+        record = tryparse(fields[1])
+        if record:
+            parsed.append(record)
+    return parsed
+
+LASTEVENT='THISISTHELASTEVENT'
+
+# menus whose mouse clicks get an element id (see map_events_for_person)
+_MENU_AREAS = (
+    "menu_FilePopup", "historyUndoMenu", "historyUndoWindowMenu", "file-menu",
+    "menu_EditPopup", "menu_viewPopup", "view-menu", "goPopup", "tools-menu",
+    "history-menu", "menu_ToolsPopup", "bookmarksMenu", "bookmarksMenuPopup",
+    "windowPopup", "menu_HelpPopup", "helpMenu", "windowMenu")
+
+# menus whose 'user-defined item' entries roll up into one element id
+_USER_DEFINED_ELEMIDS = {
+    "bookmarksMenu": "personal-bookmarks",
+    "history-menu": "recently-visited-pages",
+    "historyUndoMenu": "recently-closed-tabs",
+    "historyUndoWindowMenu": "recently-closed-windows",
+}
+
+
+def map_events_for_person(in_events):
+    """ yields a gen of summarized events.
+
+    This duplicates (badly) ilana's code, sadly
+
+    in_events: iterable of (type, area, sub1, sub2, timestamp) events.
+    yields (area, sub1, sub2, elemid, timestamp) for each event whose type
+    is not 0 or 3, then a final (LASTEVENT, '', '', 0) sigil.
+    """
+    # [1, 'window', '4', 'window closed', 1335290155985]
+    # [1, 'menus', 'key_newNavigatorTab', 'key shortcut', 1335292079565],
+
+    for event in in_events:
+        t,area,sub1,sub2,ts = event
+
+        # special cases
+        if t == 3: continue # stats, counts, etc.
+        if t == 0: continue # study startup, etc.
+
+        ## gory gory details...
+        ## TODO clean up.
+        elemid = '--'
+        if area in _MENU_AREAS and sub2 == "mouse":
+            if area == "windowMenu" and sub1[:6] == "window":
+                elemid = "go-to-window"
+            elif sub1 == "user-defined item" and area in _USER_DEFINED_ELEMIDS:
+                elemid = _USER_DEFINED_ELEMIDS[area]
+            else:
+                elemid = sub1
+
+        yield area,sub1,sub2,elemid, ts
+
+    yield LASTEVENT,'','',0 # a sigil
+
+from collections import Counter, defaultdict
+
+def _is_last(event):
+    """ True for the LASTEVENT sigil, bare or as the tuple the mapper yields """
+    return event == LASTEVENT or event[0] == LASTEVENT
+
+def pairs(C,gen):
+    """ on a person! assumes only one LASTEVENT per
+
+    Tallies consecutive events from gen as C[first][second] += 1, stopping
+    at the LASTEVENT sigil.  C has to supply a counter for unseen keys,
+    e.g. defaultdict(Counter).
+    """
+    first = next(gen, None)
+    if first is None or _is_last(first):
+        return None # nobody home
+    for second in gen:
+        # the sigil arrives as a tuple, so a plain == LASTEVENT test never
+        # fired and the sigil was counted as a real event
+        if _is_last(second):
+            break
+        C[first][second] +=1
+        first = second
+
+    return None # side effect, gross
+
+## we gots to do this matrix style, alas.
+def outfiles(C,fn_csv,fn_adj_json, fn_adj_json2):
+    """ outputs csv and json of corrmatrix
+
+    C: first -> Counter of second, as built by pairs().
+    fn_csv: one name,color row per key.
+    fn_adj_json: pair counts as a fraction of all pairs.
+    fn_adj_json2: pair counts as a fraction of that row's pairs.
+    Rows and columns are in sorted key order.  Returns True.
+    """
+
+    ## all have to appear on both sides... full adj matrix
+    K = list(chain(*[C[x].keys() for x in C.keys()])) + C.keys()
+    K = sorted(set(K))
+
+    with open(fn_csv,'w') as fh_csv:
+        writer = csv.writer(fh_csv)
+        writer.writerow(['name','color'])
+        for k in K:
+            #fh_csv.writerow(k,"%s" % (hex(random.getrandbits(8*3)[2:])))
+            color = "#"+ hashlib.md5(k).hexdigest()[:6]
+            writer.writerow([k,color])
+
+    def pcts(row, total):
+        # a key seen only as a 'second' has an empty row: give it zeros
+        # instead of dying with ZeroDivisionError
+        if not total:
+            return [0.0 for j in K]
+        return [row[j] / total for j in K]
+
+    s = float(sum([sum(C[k].values())for k in K]))
+    matrix = [pcts(C[k], s) for k in K]
+    with open(fn_adj_json,'w') as fh_json:
+        simplejson.dump(matrix, fh_json, indent=2)
+
+    # within group pcts
+    matrix2 = [pcts(C[k], float(sum(C[k].values()))) for k in K]
+    with open(fn_adj_json2,'w') as fh_json2:
+        simplejson.dump(matrix2, fh_json2, indent=2)
+
+    return True
+
+
+def main(fn):
+    """ prints a JSON line [person index, event, ctime string] per event.
+
+    fn: the input lines, e.g. fileinput.input().
+    """
+    people = parse_input_file(fn)
+    events = (x['events'] for x in people)
+    meta_gen = (map_events_for_person(x) for x in events)
+    #C = defaultdict(Counter)
+    #modify C by side-effect
+    #for gen in meta_gen: pairs(C,gen)
+    #outfiles(C,'actions.csv','matrix.json','matrix2.json')
+    for (ii,gen) in enumerate(meta_gen):
+        for x in gen:
+            print simplejson.dumps([ii,x,time.ctime(float(x[-1]/1000)).strip()])
+
+if __name__ == "__main__":
+    main(fileinput.input())
diff --git a/data/faststat.py b/data/faststat.py
new file mode 100644
index 0000000..d5b19ce
--- /dev/null
+++ b/data/faststat.py
@@ -0,0 +1,67 @@
+"""
+args:
+    needle [...] filename
+"""
+
+import fileinput
+import simplejson
+from collections import Counter
+from itertools import chain
+from math import floor
+
+# for each line, does it match
+# record % of people having it
+# record distribution of use for people having it
+
+
+def summary(counts):
+    """ returns (total, mean per key, values at the 5/25/50/75/95% ranks).
+
+    counts: mapping of key -> count.  With no keys the answer is
+    (0, 0.0, []) rather than a ZeroDivisionError.
+    """
+    p = len(counts)
+    if not p:
+        return 0,0.0,[] # nothing was counted
+    n = sum(counts.values())
+    mean = n / float(p)
+    indices = [int(floor(k * p)) for k in (.05,.25,.50,.75,.95)]
+    reexpanded = sorted(counts.itervalues())
+    qs = [reexpanded[i] for i in indices]
+
+    return n,mean,qs
+
+
+# this is sloppy parse bs
+# we could be smarter here and do all events at once.
+def faststat(needle,lines):
+    """ stats on the lines that contain needle (plain substring match).
+
+    lines: iterable of JSON lines, each a [person, event, timestr] triple.
+    returns a dict of needle, nevents (lines read), npeople (distinct
+    persons), pct (percent of persons with a matching line) and summary
+    (summary() of the matches per person).
+    """
+    n = 0
+    allids = set()
+    C = Counter()
+    for line in lines:
+        n += 1
+        person,evt,timestr = simplejson.loads(line.strip())
+        allids.add(person)
+        if needle in line:
+            C[person] += 1
+
+    npeople = len(allids)
+    # empty input: report 0% instead of dividing by zero
+    pct = 100*float(len(C))/npeople if npeople else 0.0
+    return dict(needle=needle, nevents=n,npeople=npeople,pct=pct, summary=summary(C))
+
+if __name__ == "__main__":
+    import sys
+    needles = sys.argv[1:-1]
+    for n in needles:
+        # every needle needs a fresh pass over the file; close it afterwards
+        with open(sys.argv[-1]) as fh:
+            print faststat(n,fh)
+
diff --git a/data/parseback.py b/data/parseback.py
new file mode 100644
index 0000000..c972218
--- /dev/null
+++ b/data/parseback.py
@@ -0,0 +1,102 @@
+from itertools import chain
+import simplejson
+import fileinput
+from collections import Counter, defaultdict
+
+# globals, yay!
+me = []
+C = Counter()
+Longrunners = Counter()
+
+prev = 0
+hasany = set()
+
+def complete():
+    """ closes out the current run (the global me) and starts a new one.
+
+    Tallies the run length in C and, for runs of 4 or more, the last
+    action in Longrunners.  An empty run is ignored.
+    """
+    global me
+    if not me:
+        return # nothing pending; me[-1] used to raise IndexError here
+    lastaction = ('(unknown)','','')
+    # NOTE(review): me[-1] is a whole parsed line; if that is a list this
+    # asks whether one of its items equals 'back', not whether the event
+    # mentions 'back' -- confirm the intent.
+    if 'back' not in me[-1]:
+        lastaction = tuple(me.pop(-1)[1][:3])
+
+    n = len(me)
+    C[n] += 1
+    if n >= 4:
+        Longrunners[lastaction] += 1
+
+    if n > 100: print "LONG\n", "\n".join(map(str,me))
+    me = []
+
+for line in fileinput.input():
+    line = line.strip()
+    if line == "--":
+        complete()
+    elif line:
+        d = simplejson.loads(line)
+        me.append(d)
+        hasany.add(d[0])
+
+# input that does not end with "--" leaves a run pending: count it too
+complete()
+
+
+def summary(counts):
+    """ returns (n, mean, values at the 5/25/50/75/95% ranks).
+
+    counts: mapping of value -> times seen; n is the total of the counts.
+    With nothing seen the answer is (0, 0.0, []) rather than a
+    ZeroDivisionError.
+    """
+    n = sum(counts.values())
+    if not n:
+        return 0,0.0,[]
+    mean = sum((k*v for k,v in counts.iteritems())) / float(n)
+    # ugh! turn back into list...
+    indices = [int(k * n) for k in (.05,.25,.50,.75,.95)]
+    reexpanded = list(chain(*([k,]*v for (k,v) in sorted(counts.iteritems()))))
+    qs = [reexpanded[i] for i in indices]
+
+    return n,mean,qs
+
+
+n = sum(C.values())
+print C.most_common()
+for k in sorted(C.iteritems()):
+    k1 = list(k[:]) + [k[1]/float(n),]
+    print '{0}\t{2:.1%}\t{1}'.format(*k1)
+
+nr = sum(Longrunners.values())
+for k in Longrunners.most_common(20):
+    k1 = list(k[:]) + [k[1]/float(nr),]
+    print '{0}\t{2:.1%}\t{1}'.format(*k1)
+
+
+
+print "hasany", len(hasany)
+print summary(C)
+
+
+
+TODO = """
+
+for 'long' (4 or more), what do they do at the end of it?
+break up 'long runs', and ensure we aren't 'mingling'
+get 'event' code from Ilana.
+
+does swipe trigger the back button in this study?
+
+back button is very popular (of 85% usage)
+
+sequences of clicks can imply user desire, without asking them.
+"""
+
+
+print TODO