Skip to content

Commit

Permalink
ci: expand python lint and formatting (VowpalWabbit#3800)
Browse files Browse the repository at this point in the history
* ci: expand python lint and formatting

* Update lint.yml

* Update lint.yml
  • Loading branch information
jackgerrits authored Mar 17, 2022
1 parent dde8b11 commit 104cb18
Show file tree
Hide file tree
Showing 19 changed files with 284 additions and 205 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ jobs:
run: |
pip install -r requirements.txt
pip install pytype
# required for test and utl directory typecheck
pip install hyperopt matplotlib seaborn
- name: Install wheel
shell: bash
run: |
Expand All @@ -64,6 +66,8 @@ jobs:
shell: bash
run: |
python -m pytype ./python/vowpalwabbit/ --verbosity=2
python -m pytype ./test/ --verbosity=2
python -m pytype ./utl/ --verbosity=2
python-formatting:
name: python.formatting
runs-on: ubuntu-latest
Expand All @@ -76,7 +80,7 @@ jobs:
- run: pip install black
- shell: bash
run: |
python -m black --check python/vowpalwabbit || (echo -e "---\nTo fix, run:\n\tpython -m black python/vowpalwabbit"; exit 1)
python -m black --check . --exclude ext_libs/ || (echo -e "---\nTo fix, run:\n\tpython -m black . --exclude ext_libs"; exit 1)
cpp-formatting:
name: c++.formatting
runs-on: ubuntu-20.04
Expand Down
56 changes: 32 additions & 24 deletions big_tests/testCode/ocr2vw.py
Original file line number Diff line number Diff line change
@@ -1,63 +1,71 @@
#!/usr/bin/env python
# convert letter.data to letter.vw

def read_letter_names(fn):
    """Read the column names of letter.data, one name per line.

    Args:
        fn: path to the names file.

    Returns:
        A list with one entry per line of ``fn``, trailing whitespace
        (including the newline) stripped.
    """
    with open(fn) as ins:
        ret = [line.rstrip() for line in ins]
    print("Read %d names from %s" % (len(ret), fn))
    return ret

def find_pixel_start(names):
    """Return the index of the first column name that starts with ``"p_"``.

    Args:
        names: sequence of column-name strings.

    Raises:
        ValueError: if no name starts with ``"p_"``.
    """
    for i, name in enumerate(names):
        if name.startswith("p_"):
            return i
    raise ValueError("No pixel data", names)


def data2vw(ifn, train, test, names):
    """Convert a tab-separated letter.data file into VW train/test files.

    Every 10th input line goes to ``test``; all others go to ``train``.
    Each output line has the form
    ``<label> 1 <letter>-<id>|Pixel <name>:<value> ...`` where ``<label>`` is
    the 1-based position of the letter counted from ``"a"`` and only pixel
    columns whose value is not ``"0"`` are emitted.

    Args:
        ifn: input path; opened with gzip when it ends in ``.gz``.
        train: output path for the training examples.
        test: output path for the test examples.
        names: column names, must contain ``"id"``, ``"letter"`` and at least
            one ``"p_"``-prefixed pixel column.

    Raises:
        ValueError: on a row whose field count differs from ``len(names)``
            or whose letter field is not a single character.
    """
    lineno = 0
    trainN = 0
    testN = 0
    if ifn.endswith(".gz"):
        import gzip

        iopener = gzip.open
    else:
        iopener = open
    id_pos = names.index("id")
    letter_pos = names.index("letter")
    pixel_start = find_pixel_start(names)
    # Text mode everywhere: gzip.open defaults to bytes, and the writes below
    # are str, so binary handles raise TypeError on Python 3.
    # newline="\n" keeps the output bytes the same on every platform.
    with iopener(ifn, "rt") as ins, open(train, "w", newline="\n") as trainS, open(
        test, "w", newline="\n"
    ) as testS:
        for line in ins:
            lineno += 1
            vals = line.rstrip().split("\t")
            if len(vals) != len(names):
                raise ValueError("Bad field count", len(vals), len(names), vals, names)
            char = vals[letter_pos]
            if len(char) != 1:
                raise ValueError("Bad letter", char)
            # Every 10th line is held out for testing.
            if lineno % 10 == 0:
                testN += 1
                outs = testS
            else:
                trainN += 1
                outs = trainS
            outs.write(
                "%d 1 %s-%s|Pixel" % (ord(char) - ord("a") + 1, char, vals[id_pos])
            )
            for i in range(pixel_start, len(names)):
                if vals[i] != "0":
                    outs.write(" %s:%s" % (names[i], vals[i]))
            outs.write("\n")
    print(
        "Read %d lines from %s; wrote %d lines into %s and %d lines into %s"
        % (lineno, ifn, trainN, train, testN, test)
    )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Convert letters.data to VW format")
    parser.add_argument("input", help="path to letter.data[.gz]")
    parser.add_argument("names", help="path to letter.names")
    # argparse %-formats help strings, so a literal percent sign must be
    # written as "%%"; a bare "%)" makes help formatting raise ValueError.
    parser.add_argument("train", help="VW train file location (90%%)")
    parser.add_argument("test", help="VW test file location (10%%)")
    args = parser.parse_args()
    data2vw(args.input, args.train, args.test, read_letter_names(args.names))
75 changes: 42 additions & 33 deletions demo/advertising/naive_baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,87 +6,96 @@
from os import devnull

# The learning algorithm is vowpal wabbit, available at https://github.com/JohnLangford/vowpal_wabbit/wiki
# Shell command line for the training pass (run through Popen in train_test_oneday).
vw_train_cmd = "../../vowpalwabbit/vw -c -f model --bfgs --passes 30 -b 22 --loss_function logistic --l2 14 --termination 0.00001 --holdout_off"
# Shell command line for the test pass; one prediction per example is read back from its stdout.
vw_test_cmd = "../../vowpalwabbit/vw -t -i model -p /dev/stdout"


def get_features(line):
    """Build the VW feature string for one already-split data row.

    The first two fields of ``line`` are skipped. Of the remaining fields,
    the first 8 are treated as integers: each positive value ``v`` is
    replaced by ``int(log(v + 0.5) / log(1.5))``; empty, zero and negative
    values are left untouched. Every non-empty field is then emitted as
    ``"<index>_<value>"``.

    Args:
        line: list of string fields; must hold at least 10 entries.

    Returns:
        The space-separated feature string.
    """
    feat = line[2:]
    # Bucketizing the integer features on a logarithmic scale
    for i in range(8):
        if feat[i]:
            v = int(feat[i])
            if v > 0:
                feat[i] = str(int(log(v + 0.5) / log(1.5)))
    return " ".join("%d_%s" % (i, v) for i, v in enumerate(feat) if v)


def train_test_oneday(day):
    """Train vw on the 21 days before the test window, then score one day.

    Rows are read sequentially from ``data.txt`` (tab-separated; field 0 is a
    timestamp in seconds, field 1 is an optional conversion timestamp, the
    rest are features). The test window is the 86400 seconds starting at
    ``86400 * (day - 1)``.

    Args:
        day: 1-based day number whose data is used for testing.

    Returns:
        ``(ll, n)``: the summed logistic loss over the test rows and the
        number of test rows.
    """
    ts_beginning_test = 86400 * (day - 1)

    with open("data.txt") as f:
        line = f.readline()

        # Beginning of the training set: 3 weeks before the test period
        while int(line.split()[0]) < ts_beginning_test - 86400 * 21:
            line = f.readline()

        call("rm -f .cache", shell=True)
        # Text-mode pipes so str can be written/read directly on Python 3.
        vw = Popen(vw_train_cmd, shell=True, stdin=PIPE, universal_newlines=True)

        print(
            "---------- Training on days %d to %d ----------------"
            % (day - 21, day - 1)
        )
        print()

        while int(line.split()[0]) < ts_beginning_test:
            fields = line[:-1].split("\t")

            label = -1
            if fields[1]:
                conv_ts = int(fields[1])
                if conv_ts < ts_beginning_test:
                    label = 1  # Positive label iff conversion and the conversion occurred before the test period

            vw.stdin.write("%d | %s\n" % (label, get_features(fields)))
            line = f.readline()

        vw.stdin.close()
        vw.wait()

        print()
        # NOTE(review): this prints day - 21 although the test window starts at
        # ts_beginning_test — confirm whether `day` was intended.
        print("---------- Testing on day %d ----------------" % (day - 21))

        ll = 0
        n = 0
        with open(devnull, "w") as err_sink:
            vw = Popen(
                vw_test_cmd,
                shell=True,
                stdin=PIPE,
                stdout=PIPE,
                stderr=err_sink,
                universal_newlines=True,
            )

            # Test is one day long
            while int(line.split()[0]) < ts_beginning_test + 86400:
                fields = line[:-1].split("\t")

                vw.stdin.write("| " + get_features(fields) + "\n")
                # Flush so vw sees the example before we block on its answer.
                vw.stdin.flush()
                dotproduct = float(vw.stdout.readline())

                # Test log likelihood
                if fields[1]:  # Positive example
                    ll += log(1 + exp(-dotproduct))
                else:  # Negative sample
                    ll += log(1 + exp(dotproduct))
                n += 1

                line = f.readline()

            # Shut the test process down instead of leaking it.
            vw.stdin.close()
            vw.wait()
            vw.stdout.close()

        return (ll, n)


def main():
    """Run train_test_oneday for days 54..60 and print the average test loss.

    Prints the per-day ``(loss, count)`` pair after each day, then the total
    loss divided by the total number of test rows.
    """
    ll = 0
    n = 0
    # Iterating over the 7 test days
    for day in range(54, 61):
        ll_day, n_day = train_test_oneday(day)
        ll += ll_day
        n += n_day
        print(ll_day, n_day)
    print()
    print("Average test log likelihood: %f" % (ll / n))


if __name__ == "__main__":
    main()
45 changes: 25 additions & 20 deletions demo/dependencyparsing/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,26 @@
import sys
from collections import defaultdict


def pc(num, den):
    """Return ``num / den`` as a percentage.

    A tiny epsilon (1e-100) is added to the denominator so that ``den == 0``
    does not raise ZeroDivisionError.
    """
    return (num / float(den + 1e-100)) * 100


def fmt_acc(label, n, l_corr, u_corr, total_errs):
    """Format one tab-separated accuracy row.

    Columns: ``label``, ``n``, ``l_corr`` as a percentage of ``n``,
    ``u_corr`` as a percentage of ``n``, and ``n - l_corr`` as a percentage
    of ``total_errs`` (each percentage with 3 decimals).
    """
    l_pc = pc(l_corr, n)
    u_pc = pc(u_corr, n)
    err_pc = pc(n - l_corr, total_errs)
    return "%s\t%d\t%.3f\t%.3f\t%.3f" % (label, n, l_pc, u_pc, err_pc)


def gen_toks(loc):
    """Yield ``(sentence_string, Token)`` pairs for every token in a file.

    The file at ``loc`` is read whole; sentences are separated by blank
    lines and each line of a sentence is one whitespace-separated token
    record. Token ids are the 0-based line index within the sentence.

    Args:
        loc: path (anything ``str()`` turns into a path) of the file to read.
    """
    # Read inside a context manager so the handle is closed promptly.
    with open(str(loc)) as f:
        sent_strs = f.read().strip().split("\n\n")
    for sent_str in sent_strs:
        tokens = [
            Token(i, tok_str.split()) for i, tok_str in enumerate(sent_str.split("\n"))
        ]
        for token in tokens:
            yield sent_str, token

Expand All @@ -37,24 +41,24 @@ def __init__(self, id_, attrs):
new_attrs.append(attrs[-3])
attrs = new_attrs
self.label = attrs[-1]
if self.label.lower() == 'root':
self.label = 'ROOT'
if self.label.lower() == "root":
self.label = "ROOT"
try:
head = int(attrs[-2])
except:
try:
self.label = 'P'
self.label = "P"
head = int(attrs[-1])
except:
print attrs
print(attrs)
raise
attrs.pop()
attrs.pop()
self.head = head
self.pos = attrs.pop()
self.word = attrs.pop()
self.dir = 'R' if head >= 0 and head < self.id else 'L'
self.dir = "R" if head >= 0 and head < self.id else "L"


def mymain(test_loc, gold_loc, eval_punct=False):
if not os.path.exists(test_loc):
Expand All @@ -67,7 +71,7 @@ def mymain(test_loc, gold_loc, eval_punct=False):
l_nc = 0
for (sst, t), (ss, g) in zip(gen_toks(test_loc), gen_toks(gold_loc)):
if not eval_punct and g.word in ",.-;:'\"!?`{}()[]":
continue
continue
prev_g = g
prev_t = t
u_c = g.head == t.head
Expand All @@ -79,7 +83,7 @@ def mymain(test_loc, gold_loc, eval_punct=False):
u_by_label[g.dir][g.label] += u_c
l_by_label[g.dir][g.label] += l_c
n_l_err = N - l_nc
for D in ['L', 'R']:
for D in ["L", "R"]:
n_other = 0
l_other = 0
u_other = 0
Expand All @@ -93,12 +97,13 @@ def mymain(test_loc, gold_loc, eval_punct=False):
else:
l_corr = l_by_label[D][label]
u_corr = u_by_label[D][label]
yield 'U: %.3f' % pc(u_nc, N)
yield 'L: %.3f' % pc(l_nc, N)
yield "U: %.3f" % pc(u_nc, N)
yield "L: %.3f" % pc(l_nc, N)


if __name__ == "__main__":
    # sys.argv is a list, so compare its length: script name + 2 file paths.
    # (Comparing the list itself to 3 raises TypeError on Python 3.)
    if len(sys.argv) < 3:
        print("Usage: parsed_pred_file gold_test_conll_file")
        sys.exit(0)
    for line in mymain(sys.argv[1], sys.argv[2], eval_punct=False):
        print(line)
Loading

0 comments on commit 104cb18

Please sign in to comment.