ci: expand python lint and formatting (VowpalWabbit#3800)

* ci: expand python lint and formatting * Update lint.yml * Update lint.yml
bassmang · Mar 17, 2022 · 104cb18 · 104cb18
1 parent dde8b11
commit 104cb18
Show file tree

Hide file tree

Showing 19 changed files with 284 additions and 205 deletions.
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -53,6 +53,8 @@ jobs:
         run: |
           pip install -r requirements.txt
           pip install pytype
+          # required for test and utl directory typecheck
+          pip install hyperopt matplotlib seaborn
       - name: Install wheel
         shell: bash
         run: |
@@ -64,6 +66,8 @@ jobs:
         shell: bash
         run: |
           python -m pytype ./python/vowpalwabbit/ --verbosity=2
+          python -m pytype ./test/ --verbosity=2
+          python -m pytype ./utl/ --verbosity=2
   python-formatting:
     name: python.formatting
     runs-on: ubuntu-latest
@@ -76,7 +80,7 @@ jobs:
       - run: pip install black
       - shell: bash
         run: |
-          python -m black --check python/vowpalwabbit || (echo -e "---\nTo fix, run:\n\tpython -m black python/vowpalwabbit"; exit 1)
+          python -m black --check . --exclude ext_libs/ || (echo -e "---\nTo fix, run:\n\tpython -m black . --exclude ext_libs"; exit 1)
   cpp-formatting:
     name: c++.formatting
     runs-on: ubuntu-20.04

diff --git a/big_tests/testCode/ocr2vw.py b/big_tests/testCode/ocr2vw.py
@@ -1,63 +1,74 @@
 #!/usr/bin/env python
 # convert letter.data to letter.vw
 
-def read_letter_names (fn):
+
+def read_letter_names(fn):
     ret = list()
     with open(fn) as ins:
         for line in ins:
             ret.append(line.rstrip())
-    print "Read %d names from %s" % (len(ret),fn)
+    print("Read %d names from %s" % (len(ret), fn))
     return ret
 
-def find_pixel_start (names):
+
+def find_pixel_start(names):
     for i in range(len(names)):
         if names[i].startswith("p_"):
             return i
-    raise ValueError("No pixel data",names)
+    raise ValueError("No pixel data", names)
+
 
-def data2vw (ifn, train, test, names):
+def data2vw(ifn, train, test, names):
     lineno = 0
     trainN = 0
     testN = 0
     if ifn.endswith(".gz"):
         import gzip
+
         iopener = gzip.open
     else:
         iopener = open
     id_pos = names.index("id")
     letter_pos = names.index("letter")
     pixel_start = find_pixel_start(names)
-    with iopener(ifn) as ins, open(train,"wb") as trainS, open(test,"wb") as testS:
+    # Text mode everywhere: lines are split on a str tab and str records are
+    # written, which raises TypeError on binary handles under Python 3.
+    with iopener(ifn, "rt") as ins, \
+            open(train, "w") as trainS, \
+            open(test, "w") as testS:
         for line in ins:
             lineno += 1
-            vals = line.rstrip().split('\t')
+            vals = line.rstrip().split("\t")
             if len(vals) != len(names):
-                raise ValueError("Bad field count",
-                                 len(vals),len(names),vals,names)
+                raise ValueError("Bad field count", len(vals), len(names), vals, names)
             char = vals[letter_pos]
             if len(char) != 1:
-                raise ValueError("Bad letter",char)
+                raise ValueError("Bad letter", char)
             if lineno % 10 == 0:
                 testN += 1
                 outs = testS
             else:
                 trainN += 1
                 outs = trainS
-            outs.write("%d 1 %s-%s|Pixel" % (ord(char)-ord('a')+1,char,vals[id_pos]))
-            for i in range(pixel_start,len(names)):
-                if vals[i] != '0':
-                    outs.write(' %s:%s' % (names[i],vals[i]))
-            outs.write('\n')
-    print "Read %d lines from %s; wrote %d lines into %s and %d lines into %s" % (
-        lineno,ifn,trainN,train,testN,test)
+            outs.write(
+                "%d 1 %s-%s|Pixel" % (ord(char) - ord("a") + 1, char, vals[id_pos])
+            )
+            for i in range(pixel_start, len(names)):
+                if vals[i] != "0":
+                    outs.write(" %s:%s" % (names[i], vals[i]))
+            outs.write("\n")
+    print(
+        "Read %d lines from %s; wrote %d lines into %s and %d lines into %s"
+        % (lineno, ifn, trainN, train, testN, test)
+    )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     import argparse
-    parser = argparse.ArgumentParser(description='Convert letters.data to VW format')
-    parser.add_argument('input',help='path to letter.data[.gz]')
-    parser.add_argument('names',help='path to letter.names')
-    parser.add_argument('train',help='VW train file location (90%)')
-    parser.add_argument('test',help='VW test file location (10%)')
+
+    parser = argparse.ArgumentParser(description="Convert letters.data to VW format")
+    parser.add_argument("input", help="path to letter.data[.gz]")
+    parser.add_argument("names", help="path to letter.names")
+    parser.add_argument("train", help="VW train file location (90%%)")
+    parser.add_argument("test", help="VW test file location (10%%)")
     args = parser.parse_args()
-    data2vw(args.input,args.train,args.test,read_letter_names(args.names))
+    data2vw(args.input, args.train, args.test, read_letter_names(args.names))
diff --git a/demo/advertising/naive_baseline.py b/demo/advertising/naive_baseline.py
@@ -6,87 +6,99 @@
 from os import devnull
 
 # The learning algorithm is vowpal wabbit, available at https://github.com/JohnLangford/vowpal_wabbit/wiki
-vw_train_cmd = '../../vowpalwabbit/vw  -c -f model --bfgs --passes 30 -b 22 --loss_function logistic --l2 14 --termination 0.00001 --holdout_off'
-vw_test_cmd = '../../vowpalwabbit/vw  -t -i model -p /dev/stdout'
+vw_train_cmd = "../../vowpalwabbit/vw  -c -f model --bfgs --passes 30 -b 22 --loss_function logistic --l2 14 --termination 0.00001 --holdout_off"
+vw_test_cmd = "../../vowpalwabbit/vw  -t -i model -p /dev/stdout"
+
 
 def get_features(line):
     feat = line[2:]
     # Bucketizing the integer features on a logarithmic scale
     for i in range(8):
-        if feat[i]: 
+        if feat[i]:
             v = int(feat[i])
-            if v>0:
-                feat[i] = str(int(log(v+0.5)/log(1.5)))
-    return ' '.join(['%d_%s' % (i,v) for i,v in enumerate(feat) if v])
+            if v > 0:
+                feat[i] = str(int(log(v + 0.5) / log(1.5)))
+    return " ".join(["%d_%s" % (i, v) for i, v in enumerate(feat) if v])
+
 
 def train_test_oneday(day):
-    ts_beginning_test = 86400*(day-1)
+    ts_beginning_test = 86400 * (day - 1)
 
-    with open('data.txt') as f:
+    with open("data.txt") as f:
         line = f.readline()
 
         # Beginning of the training set: 3 weeks before the test period
-        while int(line.split()[0]) < ts_beginning_test - 86400*21:
+        while int(line.split()[0]) < ts_beginning_test - 86400 * 21:
             line = f.readline()
 
-        call('rm -f .cache', shell=True)
+        call("rm -f .cache", shell=True)
         vw = Popen(vw_train_cmd, shell=True, stdin=PIPE)
 
-        print '---------- Training on days %d to %d ----------------' % (day-21, day-1) 
-        print
+        print(
+            "---------- Training on days %d to %d ----------------"
+            % (day - 21, day - 1)
+        )
+        print()
 
         while int(line.split()[0]) < ts_beginning_test:
-            line = line[:-1].split('\t')
+            line = line[:-1].split("\t")
 
             label = -1
             if line[1]:
                 conv_ts = int(line[1])
-                if conv_ts < ts_beginning_test: 
-                    label = 1 # Positive label iff conversion and the conversion occured before the test period
+                if conv_ts < ts_beginning_test:
+                    label = 1  # Positive label iff conversion and the conversion occurred before the test period
 
-            out = '%d | %s' % (label, get_features(line))
-            print >>vw.stdin, out
+            out = "%d | %s" % (label, get_features(line))
+            # Popen pipes are binary in Python 3, so encode before writing.
+            vw.stdin.write((out + "\n").encode())
             line = f.readline()
 
         vw.stdin.close()
         vw.wait()
 
-        print
-        print '---------- Testing on day %d ----------------' % (day-21)
-
-        vw = Popen(vw_test_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=open(devnull, 'w'))
+        print()
+        print("---------- Testing on day %d ----------------" % (day - 21))
+
+        vw = Popen(
+            vw_test_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=open(devnull, "w")
+        )
         ll = 0
         n = 0
 
         # Test is one day long
         while int(line.split()[0]) < ts_beginning_test + 86400:
-            line = line[:-1].split('\t')
+            line = line[:-1].split("\t")
 
-            print >>vw.stdin, '| '+get_features(line)
+            vw.stdin.write(("| " + get_features(line) + "\n").encode())
+            # Flush so vw receives the example before we block on its prediction.
+            vw.stdin.flush()
             dotproduct = float(vw.stdout.readline())
-    
+
             # Test log likelihood
-            if line[1]: # Positive example
-                ll += log(1+exp(-dotproduct))
-            else: # Negative sample
-                ll += log(1+exp(dotproduct))
+            if line[1]:  # Positive example
+                ll += log(1 + exp(-dotproduct))
+            else:  # Negative sample
+                ll += log(1 + exp(dotproduct))
             n += 1
 
             line = f.readline()
 
         return (ll, n)
 
+
 def main():
     ll = 0
     n = 0
     # Iterating over the 7 test days
-    for day in range(54,61):
+    for day in range(54, 61):
         ll_day, n_day = train_test_oneday(day)
         ll += ll_day
         n += n_day
-        print ll_day, n_day
-    print
-    print 'Average test log likelihood: %f' % (ll/n)
-
+        print(ll_day, n_day)
+    print()
+    print("Average test log likelihood: %f" % (ll / n))
+
+
 if __name__ == "__main__":
     main()
diff --git a/demo/dependencyparsing/evaluate.py b/demo/dependencyparsing/evaluate.py
@@ -5,22 +5,26 @@
 import sys
 from collections import defaultdict
 
+
 def pc(num, den):
-    return (num / float(den+1e-100)) * 100
+    return (num / float(den + 1e-100)) * 100
+
 
 def fmt_acc(label, n, l_corr, u_corr, total_errs):
     l_pc = pc(l_corr, n)
     u_pc = pc(u_corr, n)
     err_pc = pc(n - l_corr, total_errs)
-    return '%s\t%d\t%.3f\t%.3f\t%.3f' % (label, n, l_pc, u_pc, err_pc)
+    return "%s\t%d\t%.3f\t%.3f\t%.3f" % (label, n, l_pc, u_pc, err_pc)
 
 
 def gen_toks(loc):
-    sent_strs = open(str(loc)).read().strip().split('\n\n')
+    sent_strs = open(str(loc)).read().strip().split("\n\n")
     token = None
     i = 0
     for sent_str in sent_strs:
-        tokens = [Token(i, tok_str.split()) for i, tok_str in enumerate(sent_str.split('\n'))]
+        tokens = [
+            Token(i, tok_str.split()) for i, tok_str in enumerate(sent_str.split("\n"))
+        ]
         for token in tokens:
             yield sent_str, token
 
@@ -37,24 +41,24 @@ def __init__(self, id_, attrs):
             new_attrs.append(attrs[-3])
             attrs = new_attrs
         self.label = attrs[-1]
-        if self.label.lower() == 'root':
-            self.label = 'ROOT'
+        if self.label.lower() == "root":
+            self.label = "ROOT"
         try:
             head = int(attrs[-2])
         except:
             try:
-                self.label = 'P'
+                self.label = "P"
                 head = int(attrs[-1])
             except:
-                print attrs
+                print(attrs)
                 raise
         attrs.pop()
         attrs.pop()
         self.head = head
         self.pos = attrs.pop()
         self.word = attrs.pop()
-        self.dir = 'R' if head >= 0 and head < self.id else 'L'
-    
+        self.dir = "R" if head >= 0 and head < self.id else "L"
+
 
 def mymain(test_loc, gold_loc, eval_punct=False):
     if not os.path.exists(test_loc):
@@ -67,7 +71,7 @@ def mymain(test_loc, gold_loc, eval_punct=False):
     l_nc = 0
     for (sst, t), (ss, g) in zip(gen_toks(test_loc), gen_toks(gold_loc)):
         if not eval_punct and g.word in ",.-;:'\"!?`{}()[]":
-			continue
+            continue
         prev_g = g
         prev_t = t
         u_c = g.head == t.head
@@ -79,7 +83,7 @@ def mymain(test_loc, gold_loc, eval_punct=False):
         u_by_label[g.dir][g.label] += u_c
         l_by_label[g.dir][g.label] += l_c
     n_l_err = N - l_nc
-    for D in ['L', 'R']:
+    for D in ["L", "R"]:
         n_other = 0
         l_other = 0
         u_other = 0
@@ -93,12 +97,13 @@ def mymain(test_loc, gold_loc, eval_punct=False):
             else:
                 l_corr = l_by_label[D][label]
                 u_corr = u_by_label[D][label]
-    yield 'U: %.3f' % pc(u_nc, N)
-    yield 'L: %.3f' % pc(l_nc, N)
+    yield "U: %.3f" % pc(u_nc, N)
+    yield "L: %.3f" % pc(l_nc, N)
+
 
-if __name__ == '__main__':
-	if(sys.argv < 3):
-		print 'Usage: parsed_pred_file gold_test_conll_file'
-		sys.exit(0)
-	for line in  mymain(sys.argv[1], sys.argv[2], eval_punct=False):
-		print line
+if __name__ == "__main__":
+    if len(sys.argv) < 3:  # script name plus the two required file paths
+        print("Usage: parsed_pred_file gold_test_conll_file")
+        sys.exit(0)
+    for line in mymain(sys.argv[1], sys.argv[2], eval_punct=False):
+        print(line)