Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ rq
python-dotenv
numpy
datasketch
python-dotenv
mysql-connector-python
10 changes: 6 additions & 4 deletions stackoversight/pipeline/pipelineoutput.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,11 @@ def print(self, index=None):
def form_lsh(self):
minhash = []

for s in self.__items:
for list_keywords in self.__items:
m = MinHash(num_perm=256)
for q in s:
m.update(q.encode('utf8'))
for result in list_keywords:
if result is not None:
m.update(result.encode('utf8'))
minhash.append(m)

forest = MinHashLSHForest(num_perm=256)
Expand Down Expand Up @@ -93,7 +94,8 @@ def query(self, item=None):

m = MinHash(num_perm=256)
for s in item:
m.update(s.encode('utf8'))
if s is not None:
m.update(s.encode('utf8'))
query = self.__forest.query(m, 5)
out = np.array(query)
self.__hash_results = query
Expand Down
98 changes: 47 additions & 51 deletions stackoversight/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,48 +7,52 @@
import asyncio
import rq

# TODO: tuples instead

code_base = "for i in range(1,11):\n" \
" n = 1\n" \
" for j in range(1, i+1):\n" \
" n *= j\n" \
" print('{}! = {}'.format(i, n))"

code_sample = "for i in range(25,100):\n" \
" n = 3\n" \
" # This is a comment!\n" \
" # Notice how it is found as a 'NONE'!\n" \
" for j in range(2, i+5):\n" \
" n *= j\n" \
" # Another comment!\n" \
" print('{}! = {}'.format(i, n))"

code_samplet = """for i in len(array):
k = 12
j = 10
for n in range(1, len(array) + i):
j *= n
k = k + i
other_func_call()
print('{}! = {}'.format(i,n))
print('Done!')
"""

code_samplet2 = """def levenshtein(seq1, seq2):
size_x = len(seq1) + 1
size_y = len(seq2) + 1
matrix = np.zeros ((size_x, size_y))
for x in range(size_x):
matrix [x, 0] = x
for y in range(size_y):
matrix [0, y] = y
return matrix
"""

not_code = "This is an example of\nsomething that is not even a code snippet!\n" \
" it contains code such as: for i in range(1, 10):\n" \
" But it would never compile."
samples = [
"""
for i in range(1,11):
n = 1
for j in range(1, i+1):
n *= j
print('{}! = {}'.format(i, n))
""",
"""
for i in range(25,100):
n = 3
# This is a comment!
# Notice how it is found as a 'NONE'!
for j in range(2, i+5):
n *= j
# Another comment!
print('{}! = {}'.format(i, n))
""",
"""for i in len(array):
k = 12
j = 10
for n in range(1, len(array) + i):
j *= n
k = k + i
other_func_call()
print('{}! = {}'.format(i,n))
print('Done!')
""",
"""def levenshtein(seq1, seq2):
size_x = len(seq1) + 1
size_y = len(seq2) + 1
matrix = np.zeros ((size_x, size_y))
for x in range(size_x):
matrix [x, 0] = x
for y in range(size_y):
matrix [0, y] = y
return matrix
""",
"""This is an example of
something that is not even a code snippet!
it contains code such as:
for i in range(1, 10):
But it would never compile.
""",
"isn't code"
]

# Set the pipeline steps up into the correct order
processing_steps = [
Expand All @@ -58,16 +62,8 @@
KeywordExtractor()
]

snippets = [
code_base,
code_sample,
code_samplet,
code_samplet2,
not_code
]

pipeline = Pipeline(processing_steps)
output = pipeline.execute_synchronous(snippets)
output = pipeline.execute_synchronous(samples)
output.form_lsh()
output.set_input(output[0])
query_out = output.query()
Expand Down