diff --git a/.gitignore b/.gitignore
index 96374c4..d8344cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+*.pyc
+
 # Windows image file caches
 Thumbs.db
 ehthumbs.db
diff --git a/createDictionaries.py b/createDictionaries.py
index ece7cfe..44a8199 100644
--- a/createDictionaries.py
+++ b/createDictionaries.py
@@ -11,9 +11,9 @@
 import cPickle
 import re
 from twokenize import tokenize
+from utils import process_line
 from random import seed
-
 seed(50)
 
 
 #optimization that is currently not used
@@ -40,15 +40,6 @@ def read_random_line(f, chunk_size=128):
         return f_handle.readline()
 
 
-
-def is_number(s):
-    try:
-        float(s)
-        return True
-    except ValueError:
-        return False
-
-
 def diff_times_in_seconds(t1,t2,date1,date2):
     t1 = t1.split(':')
     t2 = t2.split(':')
@@ -69,75 +60,6 @@ def diff_times_in_seconds(t1,t2,date1,date2):
     return t2_secs - t1_secs
 
 
-def is_url(s):
-    return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') or s.startswith('ftps://') or s.startswith('smb://')
-
-
-def replace_sentence(text):
-    if isinstance(text,basestring) == False:
-        return text
-    words = nltk.word_tokenize(text)
-    sent = nltk.pos_tag(words)
-    chunks = nltk.ne_chunk(sent, binary=False)
-    sentence = []
-    nodelist = ['PERSON','ORGANIZATION','GPE','LOCATION','FACILITY','GSP']
-    for c,word in zip(chunks, words):
-        changed = False
-        if hasattr(c, 'node'):
-            if c.node in nodelist:
-                sentence.append("__%s__" % c.node)
-                changed = True
-        if not changed:
-            if is_url(word):
-                sentence.append("__URL__")
-            elif is_number(word):
-                sentence.append("__NUMBER__")
-            elif os.path.isabs(word):
-                sentence.append("__PATH__")
-            else:
-                sentence.append(word)
-    return " ".join(sentence)
-
-def clean_str(string, TREC=False):
-    """
-    Tokenization/string cleaning for all datasets except for SST.
-    Every dataset is lower cased except for TREC
-    """
-    string = re.sub(r"\'m", " \'m", string)
-    string = re.sub(r"\'s", " \'s", string)
-    string = re.sub(r"\'ve", " \'ve", string)
-    string = re.sub(r"n\'t", " n\'t", string)
-    string = re.sub(r"\'re", " \'re", string)
-    string = re.sub(r"\'d", " \'d", string)
-    string = re.sub(r"\'ll", " \'ll", string)
-    string = re.sub(r"`", " ` ", string)
-    string = re.sub(r",", " , ", string)
-    string = string.replace('', '__EOS__')
-    return string.strip()
-
-def process_token(c, word):
-    nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
-    if hasattr(c, 'label'):
-        if c.label() in nodelist:
-            return "__%s__" % c.label()
-    if is_url(word):
-        return "__URL__"
-    elif is_number(word):
-        return "__NUMBER__"
-    elif os.path.isabs(word):
-        return "__PATH__"
-    return word
-
-def process_line(s, clean_string=True):
-    if clean_string:
-        s = clean_str(s)
-    tokens = tokenize(s)
-    #return [process_token(None,token).lower() for token in tokens]
-    sent = nltk.pos_tag(tokens)
-    chunks = nltk.ne_chunk(sent, binary=False)
-    return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
-
-
 class CreateDataset:
 
     def __init__(self,path):
diff --git a/createdataset.py b/createdataset.py
index 83cfa83..eb53dbb 100644
--- a/createdataset.py
+++ b/createdataset.py
@@ -15,22 +15,10 @@
 import cPickle
 import re
 from twokenize import tokenize
+from utils import process_line
 
 seed(500)
 
-
-def is_number(s):
-    try:
-        float(s)
-        return True
-    except ValueError:
-        return False
-
-def is_url(s):
-    return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
-        or s.startswith('ftps://') or s.startswith('smb://')
-
-
 def diff_times_in_seconds(t1,t2,date1,date2):
     """
     Returns the difference in time (in seconds) between two dates
@@ -55,51 +43,6 @@ def diff_times_in_seconds(t1,t2,date1,date2):
     return t2_secs - t1_secs
 
 
-def clean_str(string, TREC=False):
-    """
-    Tokenization/string cleaning for all datasets except for SST.
-    Every dataset is lower cased except for TREC
-    """
-    string = re.sub(r"\'m", " \'m", string)
-    string = re.sub(r"\'s", " \'s", string)
-    string = re.sub(r"\'ve", " \'ve", string)
-    string = re.sub(r"n\'t", " n\'t", string)
-    string = re.sub(r"\'re", " \'re", string)
-    string = re.sub(r"\'d", " \'d", string)
-    string = re.sub(r"\'ll", " \'ll", string)
-    string = re.sub(r"`", " ` ", string)
-    string = re.sub(r",", " , ", string)
-    return string.strip()
-
-def process_token(c, word):
-    """
-    Use NLTK to replace named entities with generic tags.
-    Also replace URLs, numbers, and paths.
-    """
-    nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
-    if hasattr(c, 'label'):
-        if c.label() in nodelist:
-            return "__%s__" % c.label()
-    if is_url(word):
-        return "__URL__"
-    elif is_number(word):
-        return "__NUMBER__"
-    elif os.path.isabs(word):
-        return "__PATH__"
-    return word
-
-def process_line(s, clean_string=True):
-    """
-    Processes a line by iteratively calling process_token.
-    """
-    if clean_string:
-        s = clean_str(s)
-    tokens = tokenize(s)
-    sent = nltk.pos_tag(tokens)
-    chunks = nltk.ne_chunk(sent, binary=False)
-    return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
-
-
 class CreateDataset:
 
     def __init__(self,path):
diff --git a/find_testfiles.py b/find_testfiles.py
index 32385d6..f723e36 100644
--- a/find_testfiles.py
+++ b/find_testfiles.py
@@ -8,63 +8,7 @@
 from twokenize import tokenize
 import nltk
 from sklearn.externals import joblib
-
-
-
-def is_number(s):
-    try:
-        float(s)
-        return True
-    except ValueError:
-        return False
-
-def is_url(s):
-    return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
-        or s.startswith('ftps://') or s.startswith('smb://')
-
-def clean_str(string, TREC=False):
-    """
-    Tokenization/string cleaning for all datasets except for SST.
-    Every dataset is lower cased except for TREC
-    """
-    string = re.sub(r"\'m", " \'m", string)
-    string = re.sub(r"\'s", " \'s", string)
-    string = re.sub(r"\'ve", " \'ve", string)
-    string = re.sub(r"n\'t", " n\'t", string)
-    string = re.sub(r"\'re", " \'re", string)
-    string = re.sub(r"\'d", " \'d", string)
-    string = re.sub(r"\'ll", " \'ll", string)
-    string = re.sub(r"`", " ` ", string)
-    string = re.sub(r",", " , ", string)
-    return string.strip()
-
-def process_token(c, word):
-    """
-    Use NLTK to replace named entities with generic tags.
-    Also replace URLs, numbers, and paths.
-    """
-    nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
-    if hasattr(c, 'label'):
-        if c.label() in nodelist:
-            return "__%s__" % c.label()
-    if is_url(word):
-        return "__URL__"
-    elif is_number(word):
-        return "__NUMBER__"
-    elif os.path.isabs(word):
-        return "__PATH__"
-    return word
-
-def process_line(s, clean_string=True):
-    """
-    Processes a line by iteratively calling process_token.
-    """
-    if clean_string:
-        s = clean_str(s)
-    tokens = tokenize(s)
-    sent = nltk.pos_tag(tokens)
-    chunks = nltk.ne_chunk(sent, binary=False)
-    return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
+from utils import process_line
 
 def writeFiles(csvname, data, listbool=False, overwrite=False):
     """
diff --git a/find_testfiles2.py b/find_testfiles2.py
index 48798f7..b21f4f1 100644
--- a/find_testfiles2.py
+++ b/find_testfiles2.py
@@ -8,63 +8,7 @@
 from twokenize import tokenize
 import nltk
 from sklearn.externals import joblib
-
-
-
-def is_number(s):
-    try:
-        float(s)
-        return True
-    except ValueError:
-        return False
-
-def is_url(s):
-    return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
-        or s.startswith('ftps://') or s.startswith('smb://')
-
-def clean_str(string, TREC=False):
-    """
-    Tokenization/string cleaning for all datasets except for SST.
-    Every dataset is lower cased except for TREC
-    """
-    string = re.sub(r"\'m", " \'m", string)
-    string = re.sub(r"\'s", " \'s", string)
-    string = re.sub(r"\'ve", " \'ve", string)
-    string = re.sub(r"n\'t", " n\'t", string)
-    string = re.sub(r"\'re", " \'re", string)
-    string = re.sub(r"\'d", " \'d", string)
-    string = re.sub(r"\'ll", " \'ll", string)
-    string = re.sub(r"`", " ` ", string)
-    string = re.sub(r",", " , ", string)
-    return string.strip()
-
-def process_token(c, word):
-    """
-    Use NLTK to replace named entities with generic tags.
-    Also replace URLs, numbers, and paths.
-    """
-    nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
-    if hasattr(c, 'label'):
-        if c.label() in nodelist:
-            return "__%s__" % c.label()
-    if is_url(word):
-        return "__URL__"
-    elif is_number(word):
-        return "__NUMBER__"
-    elif os.path.isabs(word):
-        return "__PATH__"
-    return word
-
-def process_line(s, clean_string=True):
-    """
-    Processes a line by iteratively calling process_token.
-    """
-    if clean_string:
-        s = clean_str(s)
-    tokens = tokenize(s)
-    sent = nltk.pos_tag(tokens)
-    chunks = nltk.ne_chunk(sent, binary=False)
-    return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
+from utils import process_line
 
 def writeFiles(csvname, data, listbool=False, overwrite=False):
     """
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..3d64de3
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,72 @@
+import itertools
+import nltk
+import os
+import re
+from twokenize import tokenize
+
+def is_number(s):
+    try:
+        float(s)
+        return True
+    except ValueError:
+        return False
+
+def is_url(s):
+    return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
+        or s.startswith('ftps://') or s.startswith('smb://')
+
+def clean_str(string, TREC=False):
+    """
+    Tokenization/string cleaning for all datasets except for SST.
+    Every dataset is lower cased except for TREC
+    """
+    string = re.sub(r"\'m", " \'m", string)
+    string = re.sub(r"\'s", " \'s", string)
+    string = re.sub(r"\'ve", " \'ve", string)
+    string = re.sub(r"n\'t", " n\'t", string)
+    string = re.sub(r"\'re", " \'re", string)
+    string = re.sub(r"\'d", " \'d", string)
+    string = re.sub(r"\'ll", " \'ll", string)
+    string = re.sub(r"`", " ` ", string)
+    string = re.sub(r",", " , ", string)
+    return string.strip()
+
+def process_token(c):
+    """
+    Use NLTK to replace named entities with generic tags.
+    Also replace URLs, numbers, and paths.
+    """
+    nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
+    if hasattr(c, 'label'):
+        if c.label() in nodelist:
+            return "__%s__" % c.label()
+    word = ' '.join([t[0] for t in c.leaves()]) if isinstance(c, nltk.tree.Tree) else c[0]
+    if is_url(word):
+        return "__URL__"
+    elif is_number(word):
+        return "__NUMBER__"
+    elif os.path.isabs(word):
+        return "__PATH__"
+    return word
+
+def process_line(s, clean_string=True):
+    """
+    Processes a line by iteratively calling process_token.
+    """
+    if clean_string:
+        s = clean_str(s)
+    tokens = tokenize(s)
+    sent = nltk.pos_tag(tokens)
+    chunks = nltk.ne_chunk(sent, binary=False)
+    return [process_token(c).lower().encode('UTF-8') for c in chunks]
+
+def test():
+    s='''
+    hi, please some1 can help me with my driver in ubuntu :( its a intel GM965 i tried compiz, but give me the error, Checking for Xgl: not present. Blacklisted PCIID '8086:2a02' found aborting and using fallback: /usr/bin/metacity some1 can help me please :( what kind of video card are you running? if you're not sure exactly, lspci | grep -i vga will tell you nickrud 00:02.0 VGA compatible controller: Intel Corporation Mobile GM965/GL960 Integrated Graphics Controller (rev 03) http://wiki.compiz-fusion.org/Hardware/Blacklist nickrud ty i go try it
+    '''
+
+    print process_line(s)
+
+if __name__ == '__main__':
+    test()
+