2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
*.pyc

# Windows image file caches
Thumbs.db
ehthumbs.db
80 changes: 1 addition & 79 deletions createDictionaries.py
@@ -11,9 +11,9 @@
import cPickle
import re
from twokenize import tokenize
from utils import process_line

from random import seed

seed(50)

#optimization that is currently not used
@@ -40,15 +40,6 @@ def read_random_line(f, chunk_size=128):
return f_handle.readline()



def is_number(s):
try:
float(s)
return True
except ValueError:
return False


def diff_times_in_seconds(t1,t2,date1,date2):
t1 = t1.split(':')
t2 = t2.split(':')
@@ -69,75 +60,6 @@ def diff_times_in_seconds(t1,t2,date1,date2):
return t2_secs - t1_secs


def is_url(s):
return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') or s.startswith('ftps://') or s.startswith('smb://')


def replace_sentence(text):
if isinstance(text,basestring) == False:
return text
words = nltk.word_tokenize(text)
sent = nltk.pos_tag(words)
chunks = nltk.ne_chunk(sent, binary=False)
sentence = []
nodelist = ['PERSON','ORGANIZATION','GPE','LOCATION','FACILITY','GSP']
for c,word in zip(chunks, words):
changed = False
if hasattr(c, 'node'):
if c.node in nodelist:
sentence.append("__%s__" % c.node)
changed = True
if not changed:
if is_url(word):
sentence.append("__URL__")
elif is_number(word):
sentence.append("__NUMBER__")
elif os.path.isabs(word):
sentence.append("__PATH__")
else:
sentence.append(word)
return " ".join(sentence)

def clean_str(string, TREC=False):
"""
Tokenization/string cleaning for all datasets except for SST.
Every dataset is lower cased except for TREC
"""
string = re.sub(r"\'m", " \'m", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r"`", " ` ", string)
string = re.sub(r",", " , ", string)
string = string.replace('</s>', '__EOS__')
return string.strip()

def process_token(c, word):
nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
if hasattr(c, 'label'):
if c.label() in nodelist:
return "__%s__" % c.label()
if is_url(word):
return "__URL__"
elif is_number(word):
return "__NUMBER__"
elif os.path.isabs(word):
return "__PATH__"
return word

def process_line(s, clean_string=True):
if clean_string:
s = clean_str(s)
tokens = tokenize(s)
#return [process_token(None,token).lower() for token in tokens]
sent = nltk.pos_tag(tokens)
chunks = nltk.ne_chunk(sent, binary=False)
return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]


class CreateDataset:

def __init__(self,path):
59 changes: 1 addition & 58 deletions createdataset.py
@@ -15,22 +15,10 @@
import cPickle
import re
from twokenize import tokenize
from utils import process_line

seed(500)


def is_number(s):
try:
float(s)
return True
except ValueError:
return False

def is_url(s):
return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
or s.startswith('ftps://') or s.startswith('smb://')


def diff_times_in_seconds(t1,t2,date1,date2):
"""
Returns the difference in time (in seconds) between two dates
@@ -55,51 +43,6 @@ def diff_times_in_seconds(t1,t2,date1,date2):
return t2_secs - t1_secs


def clean_str(string, TREC=False):
"""
Tokenization/string cleaning for all datasets except for SST.
Every dataset is lower cased except for TREC
"""
string = re.sub(r"\'m", " \'m", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r"`", " ` ", string)
string = re.sub(r",", " , ", string)
return string.strip()

def process_token(c, word):
"""
Use NLTK to replace named entities with generic tags.
Also replace URLs, numbers, and paths.
"""
nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
if hasattr(c, 'label'):
if c.label() in nodelist:
return "__%s__" % c.label()
if is_url(word):
return "__URL__"
elif is_number(word):
return "__NUMBER__"
elif os.path.isabs(word):
return "__PATH__"
return word

def process_line(s, clean_string=True):
"""
Processes a line by iteratively calling process_token.
"""
if clean_string:
s = clean_str(s)
tokens = tokenize(s)
sent = nltk.pos_tag(tokens)
chunks = nltk.ne_chunk(sent, binary=False)
return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]


class CreateDataset:

def __init__(self,path):
58 changes: 1 addition & 57 deletions find_testfiles.py
@@ -8,63 +8,7 @@
from twokenize import tokenize
import nltk
from sklearn.externals import joblib



def is_number(s):
try:
float(s)
return True
except ValueError:
return False

def is_url(s):
return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
or s.startswith('ftps://') or s.startswith('smb://')

def clean_str(string, TREC=False):
"""
Tokenization/string cleaning for all datasets except for SST.
Every dataset is lower cased except for TREC
"""
string = re.sub(r"\'m", " \'m", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r"`", " ` ", string)
string = re.sub(r",", " , ", string)
return string.strip()

def process_token(c, word):
"""
Use NLTK to replace named entities with generic tags.
Also replace URLs, numbers, and paths.
"""
nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
if hasattr(c, 'label'):
if c.label() in nodelist:
return "__%s__" % c.label()
if is_url(word):
return "__URL__"
elif is_number(word):
return "__NUMBER__"
elif os.path.isabs(word):
return "__PATH__"
return word

def process_line(s, clean_string=True):
"""
Processes a line by iteratively calling process_token.
"""
if clean_string:
s = clean_str(s)
tokens = tokenize(s)
sent = nltk.pos_tag(tokens)
chunks = nltk.ne_chunk(sent, binary=False)
return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
from utils import process_line

def writeFiles(csvname, data, listbool=False, overwrite=False):
"""
58 changes: 1 addition & 57 deletions find_testfiles2.py
@@ -8,63 +8,7 @@
from twokenize import tokenize
import nltk
from sklearn.externals import joblib



def is_number(s):
try:
float(s)
return True
except ValueError:
return False

def is_url(s):
return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
or s.startswith('ftps://') or s.startswith('smb://')

def clean_str(string, TREC=False):
"""
Tokenization/string cleaning for all datasets except for SST.
Every dataset is lower cased except for TREC
"""
string = re.sub(r"\'m", " \'m", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r"`", " ` ", string)
string = re.sub(r",", " , ", string)
return string.strip()

def process_token(c, word):
"""
Use NLTK to replace named entities with generic tags.
Also replace URLs, numbers, and paths.
"""
nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
if hasattr(c, 'label'):
if c.label() in nodelist:
return "__%s__" % c.label()
if is_url(word):
return "__URL__"
elif is_number(word):
return "__NUMBER__"
elif os.path.isabs(word):
return "__PATH__"
return word

def process_line(s, clean_string=True):
"""
Processes a line by iteratively calling process_token.
"""
if clean_string:
s = clean_str(s)
tokens = tokenize(s)
sent = nltk.pos_tag(tokens)
chunks = nltk.ne_chunk(sent, binary=False)
return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
from utils import process_line

def writeFiles(csvname, data, listbool=False, overwrite=False):
"""
72 changes: 72 additions & 0 deletions utils.py
@@ -0,0 +1,72 @@
import itertools
import nltk
import os
import re
from twokenize import tokenize

def is_number(s):
try:
float(s)
return True
except ValueError:
return False

def is_url(s):
return s.startswith('http://') or s.startswith('https://') or s.startswith('ftp://') \
or s.startswith('ftps://') or s.startswith('smb://')

def clean_str(string, TREC=False):
"""
Tokenization/string cleaning for all datasets except for SST.
Every dataset is lower cased except for TREC
"""
string = re.sub(r"\'m", " \'m", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r"`", " ` ", string)
string = re.sub(r",", " , ", string)
return string.strip()

def process_token(c):
"""
Use NLTK to replace named entities with generic tags.
Also replace URLs, numbers, and paths.
"""
nodelist = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']
if hasattr(c, 'label'):
if c.label() in nodelist:
return "__%s__" % c.label()
word = c[0]
Reviewer: You can use c.leaves() to flatten the words in a tree. Maybe it's better?

Author: If it's a tree and it has a label property, we can simply return the label. I've added a check in cea46ad to handle the case where the tree doesn't have a label.

(See the sketch after the utils.py diff below.)

if is_url(word):
return "__URL__"
elif is_number(word):
return "__NUMBER__"
elif os.path.isabs(word):
return "__PATH__"
return word

def process_line(s, clean_string=True):
"""
Processes a line by iteratively calling process_token.
"""
if clean_string:
s = clean_str(s)
tokens = tokenize(s)
sent = nltk.pos_tag(tokens)
chunks = nltk.ne_chunk(sent, binary=False)
return [process_token(c).lower().encode('UTF-8') for c in chunks]

def test():
s='''
hi, please some1 can help me with my driver in ubuntu :( its a intel GM965 i tried compiz, but give me the error, Checking for Xgl: not present. Blacklisted PCIID '8086:2a02' found aborting and using fallback: /usr/bin/metacity some1 can help me please :( what kind of video card are you running? if you're not sure exactly, lspci | grep -i vga will tell you nickrud 00:02.0 VGA compatible controller: Intel Corporation Mobile GM965/GL960 Integrated Graphics Controller (rev 03) http://wiki.compiz-fusion.org/Hardware/Blacklist nickrud ty i go try it
'''

print process_line(s)

if __name__ == '__main__':
test()
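
For reference, here is a minimal sketch of the fallback discussed in the review comment on process_token above. It is only an assumption about the shape of the fix (the actual change lives in commit cea46ad and is not shown in this diff): when the chunk is an NLTK Tree whose label is not in the whitelist, the surface words can be recovered with c.leaves() before the URL/number/path replacements are applied. The name process_token_sketch is hypothetical.

import os
import nltk
from nltk.tree import Tree

NODELIST = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def is_url(s):
    return s.startswith(('http://', 'https://', 'ftp://', 'ftps://', 'smb://'))

def process_token_sketch(c):
    """Hypothetical variant of process_token: if the chunk is a Tree whose
    label is not in NODELIST, rebuild the surface form from c.leaves()
    instead of indexing c[0], which would only return the first leaf."""
    if isinstance(c, Tree):
        if c.label() in NODELIST:
            return "__%s__" % c.label()
        # A Tree, but not a named entity we tag: flatten the (word, POS)
        # leaves back into the original words, as suggested in the review.
        word = " ".join(leaf[0] for leaf in c.leaves())
    else:
        word = c[0]  # a plain (token, POS) tuple from nltk.pos_tag
    if is_url(word):
        return "__URL__"
    elif is_number(word):
        return "__NUMBER__"
    elif os.path.isabs(word):
        return "__PATH__"
    return word

If the joined leaves contain spaces, is_url and os.path.isabs simply fall through and the words are returned unchanged, which matches the existing behaviour for ordinary tokens.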