Skip to content

Commit 6fc7c34

Browse files
authored
Merge pull request #216 from bact/dev
Clean tnc_freq.txt
2 parents: 277d4a4 + 16cfbb1; commit: 6fc7c34

File tree

4 files changed

+19
-26757
lines changed

4 files changed

+19
-26757
lines changed

pythainlp/corpus/tnc.py

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,10 @@ def word_freq(word: str, domain: str = "all") -> int:
2323
This function will make a query to the server of Thai National Corpus.
2424
Internet connection is required.
2525
26-
**IMPORTANT:** Currently (as of 29 April 2019) always return 0,
27-
as the service URL has been changed and the code is not updated yet.
26+
**IMPORTANT:** Currently (as of 29 April 2019) it is likely to return 0,
27+
regardless of the word, as the service URL has been changed and the code
28+
is not updated yet.
29+
New URL is http://www.arts.chula.ac.th/~ling/tnc3/
2830
2931
:param string word: word
3032
:param string domain: domain
@@ -42,8 +44,7 @@ def word_freq(word: str, domain: str = "all") -> int:
4244
"leisure": "9",
4345
"others": "0",
4446
}
45-
url = "http://www.arts.chula.ac.th/~ling/TNCII/corp.php"
46-
# New URL is http://www.arts.chula.ac.th/~ling/tnc3/
47+
url = "http://www.arts.chula.ac.th/~ling/tnc3/"
4748
data = {"genre[]": "", "domain[]": listdomain[domain], "sortby": "perc", "p": word}
4849

4950
r = requests.post(url, data=data)
@@ -63,9 +64,10 @@ def word_freqs() -> List[Tuple[str, int]]:
6364
Get word frequency from Thai National Corpus (TNC)
6465
"""
6566
lines = list(get_corpus(_FILENAME))
66-
listword = []
67+
word_freqs = []
6768
for line in lines:
68-
listindata = line.split("\t")
69-
listword.append((listindata[0], int(listindata[1])))
69+
word_freq = line.split("\t")
70+
if len(word_freq) >= 2:
71+
word_freqs.append((word_freq[0], int(word_freq[1])))
7072

71-
return listword
73+
return word_freqs

0 commit comments

Comments
 (0)