diff --git a/requirements.txt b/requirements.txt index 0bdad8f..a2332d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,4 @@ rq python-dotenv numpy datasketch -python-dotenv \ No newline at end of file +mysql-connector-python \ No newline at end of file diff --git a/stackoversight/pipeline/pipelineoutput.py b/stackoversight/pipeline/pipelineoutput.py index 262278e..b1f742a 100644 --- a/stackoversight/pipeline/pipelineoutput.py +++ b/stackoversight/pipeline/pipelineoutput.py @@ -50,10 +50,11 @@ def print(self, index=None): def form_lsh(self): minhash = [] - for s in self.__items: + for list_keywords in self.__items: m = MinHash(num_perm=256) - for q in s: - m.update(q.encode('utf8')) + for result in list_keywords: + if result is not None: + m.update(result.encode('utf8')) minhash.append(m) forest = MinHashLSHForest(num_perm=256) @@ -93,7 +94,8 @@ def query(self, item=None): m = MinHash(num_perm=256) for s in item: - m.update(s.encode('utf8')) + if s is not None: + m.update(s.encode('utf8')) query = self.__forest.query(m, 5) out = np.array(query) self.__hash_results = query diff --git a/stackoversight/test.py b/stackoversight/test.py index 08d6500..d311e90 100644 --- a/stackoversight/test.py +++ b/stackoversight/test.py @@ -7,48 +7,52 @@ import asyncio import rq -# TODO: tuples instead - -code_base = "for i in range(1,11):\n" \ - " n = 1\n" \ - " for j in range(1, i+1):\n" \ - " n *= j\n" \ - " print('{}! = {}'.format(i, n))" - -code_sample = "for i in range(25,100):\n" \ - " n = 3\n" \ - " # This is a comment!\n" \ - " # Notice how it is found as a 'NONE'!\n" \ - " for j in range(2, i+5):\n" \ - " n *= j\n" \ - " # Another comment!\n" \ - " print('{}! = {}'.format(i, n))" - -code_samplet = """for i in len(array): - k = 12 - j = 10 - for n in range(1, len(array) + i): - j *= n - k = k + i - other_func_call() - print('{}! = {}'.format(i,n)) - print('Done!') -""" - -code_samplet2 = """def levenshtein(seq1, seq2): - size_x = len(seq1) + 1 - size_y = len(seq2) + 1 - matrix = np.zeros ((size_x, size_y)) - for x in range(size_x): - matrix [x, 0] = x - for y in range(size_y): - matrix [0, y] = y - return matrix -""" - -not_code = "This is an example of\nsomething that is not even a code snippet!\n" \ - " it contains code such as: for i in range(1, 10):\n" \ - " But it would never compile." +samples = [ + """ + for i in range(1,11): + n = 1 + for j in range(1, i+1): + n *= j + print('{}! = {}'.format(i, n)) + """, + """ + for i in range(25,100): + n = 3 + # This is a comment! + # Notice how it is found as a 'NONE'! + for j in range(2, i+5): + n *= j + # Another comment! + print('{}! = {}'.format(i, n)) + """, + """for i in len(array): + k = 12 + j = 10 + for n in range(1, len(array) + i): + j *= n + k = k + i + other_func_call() + print('{}! = {}'.format(i,n)) + print('Done!') + """, + """def levenshtein(seq1, seq2): + size_x = len(seq1) + 1 + size_y = len(seq2) + 1 + matrix = np.zeros ((size_x, size_y)) + for x in range(size_x): + matrix [x, 0] = x + for y in range(size_y): + matrix [0, y] = y + return matrix + """, + """This is an example of + something that is not even a code snippet! + it contains code such as: + for i in range(1, 10): + But it would never compile. + """, + "isn't code" +] # Set the pipeline steps up into the correct order processing_steps = [ @@ -58,16 +62,8 @@ KeywordExtractor() ] -snippets = [ - code_base, - code_sample, - code_samplet, - code_samplet2, - not_code -] - pipeline = Pipeline(processing_steps) -output = pipeline.execute_synchronous(snippets) +output = pipeline.execute_synchronous(samples) output.form_lsh() output.set_input(output[0]) query_out = output.query()