Skip to content

Commit

Permalink
update
Browse files (browse the repository at this point in the history)
  • Loading branch information
GINK03 committed Feb 3, 2018
1 parent 18da4c9 commit 7a7a897
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 17 deletions.
2 changes: 1 addition & 1 deletion 10-parse-htmls.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -20,7 +20,7 @@ def _map(arg):

objs = []
for term in terms:
for type in ["1", "2"]:
for type in ["1", "2", "3"]:
try:
html = ( open('bing-ranking-scrape/htmls/{term}_{type}'.format(term=term, type=type)).read() )
except Exception as ex:
Expand Down
39 changes: 23 additions & 16 deletions 30-index.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,34 +2,41 @@

from collections import Counter

term_id = json.loads(open("term_id.json").read())
wakatis = json.loads(open("wakatis.json").read())

data = []
query_data = {}
for obj in wakatis:
type = obj["type"]
term = obj["term"]

if type == "top":
if type == "1":
rank = 4
elif type == "2":
rank = 2
else:
rank = 1
rank = 0
wakati = dict(Counter(obj["wakati"]))
data.append( (term, rank, wakati) )
if query_data.get(term) is None:
query_data[term] = []
query_data[term].append( (rank, wakati) )

term_index = {}
for term, rank, wakati in data:
for term, freq in wakati.items():
if term_index.get(term) is None:
term_index[term] = 0
term_index[term] += 1
for query, data in query_data.items():
for rank, wakati in data:
for term, freq in wakati.items():
if term_index.get(term) is None:
term_index[term] = 0
term_index[term] += 1

open("term_index.json", "w").write( json.dumps(term_index, indent=2, ensure_ascii=False) )

g = open("./rank/train.data", "w")
fdata = open("./rank/train.data", "w")
fgroup = open("./rank/train.group", "w")

query = set()
for term, rank, wakati in data:
line = " ".join( ["%d:%d"%(term_index[term], freq) for term, freq in wakati.items()] )
print(rank, line)
g.write( str(rank) + " " + "qid:%d"%(term_id[term]) + " " + line + "\n" )
for query, data in query_data.items():
size = len(data)
fgroup.write( '%d\n'%(size) )
for rank, wakati in data:
line = " ".join( ["%d:%d"%(term_index[term], freq) for term, freq in wakati.items()] )
print(rank, line)
fdata.write( str(rank) + " " + line + "\n" )

0 comments on commit 7a7a897

Please sign in to comment.