2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
*.pyc

# Windows image file caches
Thumbs.db
ehthumbs.db
80 changes: 1 addition & 79 deletions createDictionaries.py
@@ -11,9 +11,9 @@
import cPickle
import re
from twokenize import tokenize
from utils import process_line

from random import seed

seed(50)

#optimization that is currently not used
@@ -40,15 +40,6 @@ def read_random_line(f, chunk_size=128):
return f_handle.readline()



def is_number(s):
try:
float(s)
return True
except ValueError:
return False


def diff_times_in_seconds(t1,t2,date1,date2):
t1 = t1.split(':')
t2 = t2.split(':')
@@ -69,75 +60,6 @@ def diff_times_in_seconds(t1,t2,date1,date2):
return t2_secs - t1_secs


def is_url(s):
return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') or s.startswith('ftps://') or s.startswith('smb://')


def replace_sentence(text):
if isinstance(text,basestring) == False:
return text
words = nltk.word_tokenize(text)
sent = nltk.pos_tag(words)
chunks = nltk.ne_chunk(sent, binary=False)
sentence = []
nodelist = ['PERSON','ORGANIZATION','GPE','LOCATION','FACILITY','GSP']
for c,word in zip(chunks, words):
changed = False
if hasattr(c, 'node'):
if c.node in nodelist:
sentence.append("__%s__" % c.node)
changed = True
if not changed:
if is_url(word):
sentence.append("__URL__")
elif is_number(word):
sentence.append("__NUMBER__")
elif os.path.isabs(word):
sentence.append("__PATH__")
else:
sentence.append(word)
return " ".join(sentence)

def clean_str(string, TREC=False):
"""
Tokenization/string cleaning for all datasets except for SST.
Every dataset is lower cased except for TREC
"""
string = re.sub(r"\'m", " \'m", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r"`", " ` ", string)
string = re.sub(r",", " , ", string)
string = string.replace('</s>', '__EOS__')
return string.strip()

def process_token(c, word):
nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
if hasattr(c, 'label'):
if c.label() in nodelist:
return "__%s__" % c.label()
if is_url(word):
return "__URL__"
elif is_number(word):
return "__NUMBER__"
elif os.path.isabs(word):
return "__PATH__"
return word

def process_line(s, clean_string=True):
if clean_string:
s = clean_str(s)
tokens = tokenize(s)
#return [process_token(None,token).lower() for token in tokens]
sent = nltk.pos_tag(tokens)
chunks = nltk.ne_chunk(sent, binary=False)
return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]


class CreateDataset:

def __init__(self,path):
59 changes: 1 addition & 58 deletions createdataset.py
@@ -15,22 +15,10 @@
import cPickle
import re
from twokenize import tokenize
from utils import process_line

seed(500)


def is_number(s):
try:
float(s)
return True
except ValueError:
return False

def is_url(s):
return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
or s.startswith('ftps://') or s.startswith('smb://')


def diff_times_in_seconds(t1,t2,date1,date2):
"""
Returns the difference in time (in seconds) between two dates
@@ -55,51 +43,6 @@ def diff_times_in_seconds(t1,t2,date1,date2):
return t2_secs - t1_secs


def clean_str(string, TREC=False):
"""
Tokenization/string cleaning for all datasets except for SST.
Every dataset is lower cased except for TREC
"""
string = re.sub(r"\'m", " \'m", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r"`", " ` ", string)
string = re.sub(r",", " , ", string)
return string.strip()

def process_token(c, word):
"""
Use NLTK to replace named entities with generic tags.
Also replace URLs, numbers, and paths.
"""
nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
if hasattr(c, 'label'):
if c.label() in nodelist:
return "__%s__" % c.label()
if is_url(word):
return "__URL__"
elif is_number(word):
return "__NUMBER__"
elif os.path.isabs(word):
return "__PATH__"
return word

def process_line(s, clean_string=True):
"""
Processes a line by iteratively calling process_token.
"""
if clean_string:
s = clean_str(s)
tokens = tokenize(s)
sent = nltk.pos_tag(tokens)
chunks = nltk.ne_chunk(sent, binary=False)
return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]


class CreateDataset:

def __init__(self,path):
58 changes: 1 addition & 57 deletions find_testfiles.py
@@ -8,63 +8,7 @@
from twokenize import tokenize
import nltk
from sklearn.externals import joblib



def is_number(s):
try:
float(s)
return True
except ValueError:
return False

def is_url(s):
return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
or s.startswith('ftps://') or s.startswith('smb://')

def clean_str(string, TREC=False):
"""
Tokenization/string cleaning for all datasets except for SST.
Every dataset is lower cased except for TREC
"""
string = re.sub(r"\'m", " \'m", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r"`", " ` ", string)
string = re.sub(r",", " , ", string)
return string.strip()

def process_token(c, word):
"""
Use NLTK to replace named entities with generic tags.
Also replace URLs, numbers, and paths.
"""
nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
if hasattr(c, 'label'):
if c.label() in nodelist:
return "__%s__" % c.label()
if is_url(word):
return "__URL__"
elif is_number(word):
return "__NUMBER__"
elif os.path.isabs(word):
return "__PATH__"
return word

def process_line(s, clean_string=True):
"""
Processes a line by iteratively calling process_token.
"""
if clean_string:
s = clean_str(s)
tokens = tokenize(s)
sent = nltk.pos_tag(tokens)
chunks = nltk.ne_chunk(sent, binary=False)
return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
from utils import process_line

def writeFiles(csvname, data, listbool=False, overwrite=False):
"""
58 changes: 1 addition & 57 deletions find_testfiles2.py
@@ -8,63 +8,7 @@
from twokenize import tokenize
import nltk
from sklearn.externals import joblib



def is_number(s):
try:
float(s)
return True
except ValueError:
return False

def is_url(s):
return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
or s.startswith('ftps://') or s.startswith('smb://')

def clean_str(string, TREC=False):
"""
Tokenization/string cleaning for all datasets except for SST.
Every dataset is lower cased except for TREC
"""
string = re.sub(r"\'m", " \'m", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r"`", " ` ", string)
string = re.sub(r",", " , ", string)
return string.strip()

def process_token(c, word):
"""
Use NLTK to replace named entities with generic tags.
Also replace URLs, numbers, and paths.
"""
nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
if hasattr(c, 'label'):
if c.label() in nodelist:
return "__%s__" % c.label()
if is_url(word):
return "__URL__"
elif is_number(word):
return "__NUMBER__"
elif os.path.isabs(word):
return "__PATH__"
return word

def process_line(s, clean_string=True):
"""
Processes a line by iteratively calling process_token.
"""
if clean_string:
s = clean_str(s)
tokens = tokenize(s)
sent = nltk.pos_tag(tokens)
chunks = nltk.ne_chunk(sent, binary=False)
return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
from utils import process_line

def writeFiles(csvname, data, listbool=False, overwrite=False):
"""
72 changes: 72 additions & 0 deletions utils.py
@@ -0,0 +1,72 @@
import itertools
import nltk
import os
import re
from twokenize import tokenize

def is_number(s):
try:
float(s)
return True
except ValueError:
return False

def is_url(s):
return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
or s.startswith('ftps://') or s.startswith('smb://')

def clean_str(string, TREC=False):
"""
Tokenization/string cleaning for all datasets except for SST.
Every dataset is lower cased except for TREC
"""
string = re.sub(r"\'m", " \'m", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r"`", " ` ", string)
string = re.sub(r",", " , ", string)
return string.strip()

def process_token(c):
"""
Use NLTK to replace named entities with generic tags.
Also replace URLs, numbers, and paths.
"""
nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
if hasattr(c, 'label'):
if c.label() in nodelist:
return "__%s__" % c.label()
word = c[0]
Reviewer: You can use c.leaves() to flatten the words in a tree. Maybe it's better?

Author: If it's a tree and it has a label property, we can simply return the label. I've added a check in cea46ad to handle the case where the tree doesn't have a label.

(See the sketch after the utils.py diff below.)

if is_url(word):
return "__URL__"
elif is_number(word):
return "__NUMBER__"
elif os.path.isabs(word):
return "__PATH__"
return word

def process_line(s, clean_string=True):
"""
Processes a line by iteratively calling process_token.
"""
if clean_string:
s = clean_str(s)
tokens = tokenize(s)
sent = nltk.pos_tag(tokens)
chunks = nltk.ne_chunk(sent, binary=False)
return [process_token(c).lower().encode('UTF-8') for c in chunks]

def test():
s='''
hi, please some1 can help me with my driver in ubuntu :( its a intel GM965 i tried compiz, but give me the error, Checking for Xgl: not present. Blacklisted PCIID '8086:2a02' found aborting and using fallback: /usr/bin/metacity some1 can help me please :( what kind of video card are you running? if you're not sure exactly, lspci | grep -i vga will tell you nickrud 00:02.0 VGA compatible controller: Intel Corporation Mobile GM965/GL960 Integrated Graphics Controller (rev 03) http://wiki.compiz-fusion.org/Hardware/Blacklist nickrud ty i go try it
'''

print process_line(s)

if __name__ == '__main__':
test()
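
For reference, here is a minimal sketch of the fallback discussed in the review comment on process_token above. It is only an assumption about the shape of the fix (the actual change lives in commit cea46ad and is not shown in this diff): when the chunk is an NLTK Tree whose label is not in the whitelist, the surface words can be recovered with c.leaves() before the URL/number/path replacements are applied. The name process_token_sketch is hypothetical.

import os
import nltk
from nltk.tree import Tree

NODELIST = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def is_url(s):
    return s.startswith(('http://', 'https://', 'ftp://', 'ftps://', 'smb://'))

def process_token_sketch(c):
    """Hypothetical variant of process_token: if the chunk is a Tree whose
    label is not in NODELIST, rebuild the surface form from c.leaves()
    instead of indexing c[0], which would only return the first leaf."""
    if isinstance(c, Tree):
        if c.label() in NODELIST:
            return "__%s__" % c.label()
        # A Tree, but not a named entity we tag: flatten the (word, POS)
        # leaves back into the original words, as suggested in the review.
        word = " ".join(leaf[0] for leaf in c.leaves())
    else:
        word = c[0]  # a plain (token, POS) tuple from nltk.pos_tag
    if is_url(word):
        return "__URL__"
    elif is_number(word):
        return "__NUMBER__"
    elif os.path.isabs(word):
        return "__PATH__"
    return word

If the joined leaves contain spaces, is_url and os.path.isabs simply fall through and the words are returned unchanged, which matches the existing behaviour for ordinary tokens.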