import re
import operator

from sklearn.feature_extraction.text import CountVectorizer
# Adds a 'datatype' label to the dataframe indicating whether a record is an
# original wiki answer (0), training data (1), or test data (2). Uses
# stratified random sampling (with a seed) to sample by task & plagiarism amount.
def create_datatype(df, train_value, test_value, datatype_var, compare_dfcolumn,
                    operator_of_compare, value_of_compare,
                    sampling_number, sampling_seed):
    # Subset the dataframe by the condition built from:
    # 'compare_dfcolumn' 'operator_of_compare' 'value_of_compare'
    df_subset = df[operator_of_compare(df[compare_dfcolumn], value_of_compare)]
    df_subset = df_subset.drop(columns=[datatype_var])
    # Label every row in the subset as training data
    df_subset.loc[:, datatype_var] = train_value
    # Stratified random sample of the subset, by Task and compare_dfcolumn,
    # to select the rows that will become test data
    df_sampled = df_subset.groupby(['Task', compare_dfcolumn], group_keys=False).apply(
        lambda x: x.sample(min(len(x), sampling_number), random_state=sampling_seed))
    df_sampled = df_sampled.drop(columns=[datatype_var])
    # Label every sampled row as test data
    df_sampled.loc[:, datatype_var] = test_value
    # Overwrite train_value with test_value in the subset for every row
    # chosen by the stratified sample
    for index in df_sampled.index:
        df_subset.loc[index, datatype_var] = test_value
    # Copy the train/test labels from the subset back into the main dataframe
    for index in df_subset.index:
        df.loc[index, datatype_var] = df_subset.loc[index, datatype_var]
    # Returns nothing: df is modified in place
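# Example call (illustrative sketch; `answers_df` is a hypothetical dataframe
# with 'Task', 'Category', and 'Datatype' columns): label all plagiarized rows
# (Category > 0) as train (1), then move one stratified sample per
# (Task, Category) group into test (2), seeded for reproducibility:
#
#   create_datatype(answers_df, 1, 2, 'Datatype', 'Category',
#                   operator.gt, 0, 1, 100)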
def train_test_dataframe(clean_df, random_seed=100):
    new_df = clean_df.copy()
    # Initialize Datatype as 0 for all records; after the calls below,
    # 0 remains only for the original wiki answers
    new_df.loc[:, 'Datatype'] = 0
    # Create test & training datatypes for plagiarized answers (Category > 0)
    create_datatype(new_df, 1, 2, 'Datatype', 'Category', operator.gt, 0, 1, random_seed)
    # Create test & training datatypes for NON-plagiarized answers (Category == 0)
    create_datatype(new_df, 1, 2, 'Datatype', 'Category', operator.eq, 0, 2, random_seed)
    # Map the numerical datatype labels to categorical names
    mapping = {0: 'orig', 1: 'train', 2: 'test'}
    new_df.Datatype = [mapping[item] for item in new_df.Datatype]
    return new_df
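# Example usage (illustrative; `clean_df` stands for this project's cleaned
# plagiarism dataframe with 'Task' and 'Category' columns):
#
#   complete_df = train_test_dataframe(clean_df, random_seed=100)
#   complete_df.Datatype.value_counts()  # counts of 'orig', 'train', 'test'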
# helper function for pre-processing the text of a given file
def process_file(file):
    # put text in all lower case letters
    all_text = file.read().lower()
    # replace all non-alphanumeric chars (including newlines/tabs) with spaces
    all_text = re.sub(r"[^a-zA-Z0-9]", " ", all_text)
    # collapse runs of spaces so it's easier to match phrases later
    all_text = re.sub(r" +", " ", all_text)
    return all_text
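# Example (illustrative; the filename is hypothetical):
#
#   with open('data/g0pA_taska.txt', 'r', encoding='utf-8') as f:
#       clean_text = process_file(f)
#   # e.g. "This,\nis TEXT!" -> "this is text "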
def create_text_column(df, file_directory='data/'):
    '''Reads in the files listed in a df and returns that df with an additional column, `Text`.
       :param df: A dataframe of file information, including a `File` column
       :param file_directory: the main directory where the files are stored
       :return: A dataframe with an added column of processed text'''
    # create a copy to modify
    text_df = df.copy()
    # store processed text
    text = []
    # for each file (row) in the df, read in and process the file
    for row_i in df.index:
        filename = df.loc[row_i, 'File']
        file_path = file_directory + filename
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            # standardize text using the helper function
            file_text = process_file(file)
            # append processed text to the list
            text.append(file_text)
    # add the column to the copied dataframe
    text_df['Text'] = text
    return text_df
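# Example usage (illustrative; assumes `file_info_df` has a 'File' column
# listing filenames that exist under 'data/'):
#
#   text_df = create_text_column(file_info_df, file_directory='data/')
#   text_df.head()  # original columns plus the new 'Text' column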
def create_ngram_array(n, a_text, s_text):
    # read the answer (a) and source (s) text files
    with open("data/" + a_text) as f:
        a_text_content = f.read().rstrip('\n')
    with open("data/" + s_text) as f:
        s_text_content = f.read().rstrip('\n')
    # count word n-grams of length n in both texts over a shared vocabulary
    counts = CountVectorizer(analyzer='word', ngram_range=(n, n))
    ngrams = counts.fit_transform([a_text_content, s_text_content])
    # row 0: answer text counts; row 1: source text counts
    return ngrams.toarray()
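# A containment-style similarity can be computed from the returned array;
# a minimal sketch (not part of this file's API; filenames are hypothetical,
# and numpy would need to be imported):
#
#   import numpy as np
#   ngram_array = create_ngram_array(1, 'g0pA_taska.txt', 'orig_taska.txt')
#   intersection = np.minimum(ngram_array[0], ngram_array[1]).sum()
#   containment = intersection / ngram_array[0].sum()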
def original_file(df, answer_filename):
    # map an answer filename to the original wiki source file for its task,
    # using the task letter at the end of the filename
    if answer_filename.endswith('a.txt'):
        file = 'orig_taska.txt'
    elif answer_filename.endswith('b.txt'):
        file = 'orig_taskb.txt'
    elif answer_filename.endswith('c.txt'):
        file = 'orig_taskc.txt'
    elif answer_filename.endswith('d.txt'):
        file = 'orig_taskd.txt'
    elif answer_filename.endswith('e.txt'):
        file = 'orig_taske.txt'
    else:
        raise ValueError("Unrecognized task letter in filename: " + answer_filename)
    return file
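# Example (illustrative; the answer filename is hypothetical but follows the
# dataset convention of ending in the task letter):
#
#   original_file(df, 'g0pA_taska.txt')  # -> 'orig_taska.txt'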