Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
vinayakumarr authored Jan 23, 2018
1 parent 37d79df commit dc86909
Show file tree
Hide file tree
Showing 31 changed files with 2,458 additions and 0 deletions.
74 changes: 74 additions & 0 deletions CDMC-2016/android malware classification/crossval1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import keras.preprocessing.text
import numpy as np
import pandas as pd
np.random.seed(1337) # for reproducibility
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from sklearn.metrics import (precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error)
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from keras.utils.np_utils import to_categorical
from sklearn.cross_validation import train_test_split
from keras.layers import Dropout
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from keras.wrappers.scikit_learn import KerasClassifier

# Data preparation for the cross-validated LSTM classifier.
# Loads the labelled training CSV, converts the text column into integer
# token sequences and pads them to a fixed length.  X_train, y_train,
# max_len, max_features and embedding_vecor_length defined here are read by
# create_model() and by the cross-validation code later in this script.
print("Loading")

traindata = pd.read_csv('space/CDMC2016_AndroidLabel.Train.csv', header=None)

# Column 1 is the text that gets tokenized, column 0 is used as the target.
x = traindata.iloc[:, 1]
y = traindata.iloc[:, 0]

# Vocabulary is capped with nb_words=5000, which matches max_features below
# (the Embedding input dimension).  Tokens are split on commas.
tk = keras.preprocessing.text.Tokenizer(nb_words=5000, filters=keras.preprocessing.text.base_filter(), lower=True, split=",")
tk.fit_on_texts(x)

# texts_to_sequences() yields a list of variable-length lists.  It is passed
# to pad_sequences() as-is: wrapping the ragged list in np.array() first is
# unnecessary and raises on recent NumPy releases.
X_train = tk.texts_to_sequences(x)
y_train = np.array(y)

# NOTE: batch_size is not referenced by the KerasClassifier call later in
# this script, which passes batch_size=32 explicitly.
batch_size = 64
max_len = 500
# Call-style print with explicit formatting: same output as the former
# Python 2 print statement, but valid syntax on Python 2 and 3 alike.
print("max_len  %d" % max_len)
print('Pad sequences (samples x time)')

X_train = sequence.pad_sequences(X_train, maxlen=max_len)

max_features = 5000
print('Build model...')
embedding_vecor_length = 32

def create_model():
    """Build and compile the Embedding -> LSTM binary classifier.

    Passed as ``build_fn`` to ``KerasClassifier`` in this script.  Reads the
    module-level ``max_features``, ``embedding_vecor_length`` and ``max_len``.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(max_features, embedding_vecor_length, input_length=max_len))
    model.add(Dropout(0.2))
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    # Single sigmoid unit paired with binary cross-entropy.
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the layer table itself and returns None; wrapping it
    # in print() emitted an extra "None" line each time the model was built.
    model.summary()
    return model


# Re-seed NumPy's global RNG; the same value also seeds the fold shuffling.
seed = 7
np.random.seed(seed)

# scikit-learn compatible wrapper; create_model is its model factory.
model = KerasClassifier(
    build_fn=create_model,
    nb_epoch=20,
    batch_size=32,
)

# Shuffled, stratified 10-fold cross validation over the training data.
kfold = StratifiedKFold(
    y=y_train,
    n_folds=10,
    shuffle=True,
    random_state=seed,
)
results = cross_val_score(model, X_train, y_train, cv=kfold)

# Report the mean of the per-fold scores.
print(results.mean())


83 changes: 83 additions & 0 deletions CDMC-2016/android malware classification/crossval2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import keras.preprocessing.text
import numpy as np
import pandas as pd
np.random.seed(1337) # for reproducibility
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from sklearn.metrics import (precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error)
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from keras.utils.np_utils import to_categorical
from sklearn.cross_validation import train_test_split
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from theano.tensor.shared_randomstreams import RandomStreams
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from keras.wrappers.scikit_learn import KerasClassifier


# Data preparation for the cross-validated Conv1D + LSTM classifier.
# Seeds the random generators, loads the labelled training CSV, converts the
# text column into integer token sequences and pads them to a fixed length.
# X_train, y_train, max_len, max_features and embedding_vecor_length defined
# here are read by create_model() and by the cross-validation code later in
# this script.

# fix random seed for reproducibility
np.random.seed(7)
# NOTE(review): srng is not referenced anywhere else in this script; it is
# kept so the module-level name stays available.
srng = RandomStreams(7)

print("Loading")

traindata = pd.read_csv('space/CDMC2016_AndroidLabel.Train.csv', header=None)

# Column 1 is the text that gets tokenized, column 0 is used as the target.
x = traindata.iloc[:, 1]
y = traindata.iloc[:, 0]

# Vocabulary is capped with nb_words=5000, which matches max_features below
# (the Embedding input dimension).  Tokens are split on commas.
tk = keras.preprocessing.text.Tokenizer(nb_words=5000, filters=keras.preprocessing.text.base_filter(), lower=True, split=",")
tk.fit_on_texts(x)

# texts_to_sequences() yields a list of variable-length lists.  It is passed
# to pad_sequences() as-is: wrapping the ragged list in np.array() first is
# unnecessary and raises on recent NumPy releases.
X_train = tk.texts_to_sequences(x)
y_train = np.array(y)

# NOTE: batch_size is not referenced by the KerasClassifier call later in
# this script, which passes batch_size=32 explicitly.
batch_size = 64
max_len = 500
# Call-style print with explicit formatting: same output as the former
# Python 2 print statement, but valid syntax on Python 2 and 3 alike.
print("max_len  %d" % max_len)
print('Pad sequences (samples x time)')

X_train = sequence.pad_sequences(X_train, maxlen=max_len)

max_features = 5000
embedding_vecor_length = 32


def create_model():
    """Build and compile the Embedding -> Conv1D -> MaxPool -> LSTM classifier.

    Passed as ``build_fn`` to ``KerasClassifier`` in this script.  Reads the
    module-level ``max_features``, ``embedding_vecor_length`` and ``max_len``.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(max_features, embedding_vecor_length, input_length=max_len))
    # Convolution + pooling in front of the LSTM; pool_length=2 halves the
    # number of time steps the recurrent layer has to process.
    model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu'))
    model.add(MaxPooling1D(pool_length=2))
    model.add(LSTM(100))
    # Single sigmoid unit paired with binary cross-entropy.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the layer table itself and returns None; wrapping it
    # in print() emitted an extra "None" line each time the model was built.
    model.summary()
    return model

# One seed value drives both NumPy's global RNG and the fold shuffling.
seed = 7
np.random.seed(seed)

# Expose the Keras network to scikit-learn through its classifier wrapper.
model = KerasClassifier(build_fn=create_model,
                        nb_epoch=20,
                        batch_size=32)

# 10 stratified, shuffled folds; print the average of the fold scores.
kfold = StratifiedKFold(y=y_train,
                        n_folds=10,
                        shuffle=True,
                        random_state=seed)
results = cross_val_score(model, X_train, y_train, cv=kfold)
print(results.mean())

70 changes: 70 additions & 0 deletions CDMC-2016/android malware classification/ker1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import keras.preprocessing.text
import numpy as np
import pandas as pd
np.random.seed(1337) # for reproducibility
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from sklearn.metrics import (precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error)
from sklearn import metrics
from sklearn.metrics import roc_auc_score


# Tokenization experiment: load the train/test CSVs, fit a tokenizer on the
# training text and print the resulting integer sequences.  The two string
# blocks at the end hold a disabled training pipeline and are never executed.
print("Loading")

# The CSVs are read from the 'space/' directory (the earlier variant read
# the same file names from the working directory).
traindata = pd.read_csv('space/CDMC2016_AndroidLabel.Train.csv', header=None)
testdata = pd.read_csv('space/CDMC2016_AndroidPermissions.Test.csv', header=None)

# Training file: column 0 -> y, column 1 -> x.  Test file: column 0 -> t.
y, x = traindata.iloc[:,0], traindata.iloc[:,1]
t = testdata.iloc[:,0]

# Tokenizer limited to 500 words, splitting on single spaces.
tk = keras.preprocessing.text.Tokenizer(
    nb_words=500,
    filters=keras.preprocessing.text.base_filter(),
    lower=True,
    split=" ",
)
tk.fit_on_texts(x)

# Replace the raw training texts with their token-id sequences and dump them.
x = tk.texts_to_sequences(x)
print(x)
'''
tk = keras.preprocessing.text.Tokenizer(nb_words=500, filters=keras.preprocessing.text.base_filter(), lower=True, split=" ")
tk.fit_on_texts(t)
t = tk.texts_to_sequences(t)
print(t)
'''
'''
max_len = 200
print "max_len ", max_len
print('Pad sequences (samples x time)')
x = sequence.pad_sequences(x, maxlen=max_len)
t = sequence.pad_sequences(t, maxlen=max_len)
max_features = 500
model = Sequential()
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, input_length=max_len, dropout=0.1))
model.add(LSTM(128, dropout_W=0.1, dropout_U=0.1))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
model.fit(x, y, batch_size=32, nb_epoch=30)
score, acc = model.evaluate(x, y, batch_size=32)
print('Test score:', score)
print('Test accuracy:', acc)
y_pred = model.predict_classes(t)
np.savetxt('output.txt', y_pred, fmt='%01d')
'''
80 changes: 80 additions & 0 deletions CDMC-2016/android malware classification/ker2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import keras.preprocessing.text
import numpy as np
import pandas as pd
np.random.seed(1337) # for reproducibility
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from sklearn.metrics import (precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error)
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from keras.utils.np_utils import to_categorical
from sklearn.cross_validation import train_test_split
from keras.layers import Dropout
from keras import callbacks
from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger, ReduceLROnPlateau



# Long training run of the Embedding -> LSTM classifier with per-epoch
# checkpointing and CSV logging.  All metrics in this script (validation and
# the final evaluate call) are computed on the training data itself.
print("Loading")

traindata = pd.read_csv('space/CDMC2016_AndroidLabel.Train.csv', header=None)

# Column 1 is the text that gets tokenized, column 0 is used as the target.
x = traindata.iloc[:, 1]
y = traindata.iloc[:, 0]

# Vocabulary is capped with nb_words=5000, which matches max_features below
# (the Embedding input dimension).  Tokens are split on commas.
tk = keras.preprocessing.text.Tokenizer(nb_words=5000, filters=keras.preprocessing.text.base_filter(), lower=True, split=",")
tk.fit_on_texts(x)

# texts_to_sequences() yields a list of variable-length lists.  It is passed
# to pad_sequences() as-is: wrapping the ragged list in np.array() first is
# unnecessary and raises on recent NumPy releases.
X_train = tk.texts_to_sequences(x)
y_train = np.array(y)

batch_size = 64
max_len = 500
# Call-style print with explicit formatting: same output as the former
# Python 2 print statement, but valid syntax on Python 2 and 3 alike.
print("max_len  %d" % max_len)
print('Pad sequences (samples x time)')

X_train = sequence.pad_sequences(X_train, maxlen=max_len)

max_features = 5000
print('Build model...')
embedding_vecor_length = 32

model = Sequential()
model.add(Embedding(max_features, embedding_vecor_length, input_length=max_len))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so it is not
# wrapped in print() (that produced a stray "None" line).
model.summary()

# Both callbacks write into logs/; create it up front so the run does not
# fail on the first write when the directory is missing.
import os
if not os.path.isdir('logs'):
    os.makedirs('logs')

checkpointer = callbacks.ModelCheckpoint(filepath="logs/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='val_acc',mode='max')
csv_logger = CSVLogger('logs/training_set_iranalysis.csv',separator=',', append=False)

# NOTE(review): validation_data is the training set, so 'val_acc' (which
# drives save_best_only) is training accuracy, not a held-out estimate.
# Left as-is because the log file name suggests this is intentional.
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=1000,
          validation_data=(X_train, y_train), shuffle=True,
          callbacks=[checkpointer, csv_logger])

# Evaluated on the training data as well, despite the 'Test' labels below.
score, acc = model.evaluate(X_train, y_train,
                            batch_size=32)
print('Test score:', score)
print('Test accuracy:', acc)


77 changes: 77 additions & 0 deletions CDMC-2016/android malware classification/ker3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import keras.preprocessing.text
import numpy as np
import pandas as pd
np.random.seed(1337) # for reproducibility
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from sklearn.metrics import (precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error)
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from keras.utils.np_utils import to_categorical
from sklearn.cross_validation import train_test_split
from keras.layers import Dropout

# Train the Embedding -> LSTM classifier on the labelled training CSV,
# validate/evaluate on a second labelled CSV and write the true and
# predicted labels side by side to output.txt.
print("Loading")

traindata = pd.read_csv('space/CDMC2016_AndroidLabel.Train.csv', header=None)
testdata = pd.read_csv('train_test.csv', header=None)

# In both files column 1 is the text to tokenize and column 0 the target.
x = traindata.iloc[:, 1]
y = traindata.iloc[:, 0]
xt = testdata.iloc[:, 1]
yt = testdata.iloc[:, 0]

# Vocabulary is capped with nb_words=5000, which matches max_features below
# (the Embedding input dimension).  Tokens are split on commas.
tk = keras.preprocessing.text.Tokenizer(nb_words=5000, filters=keras.preprocessing.text.base_filter(), lower=True, split=",")
tk.fit_on_texts(x)
X_train = tk.texts_to_sequences(x)

# Encode the test texts with the tokenizer fitted on the TRAINING texts.
# Fitting a second tokenizer on the test set assigns token ids from the test
# set's own word frequencies, so the same token would get a different id
# than the one the embedding layer was trained on.
X_test = tk.texts_to_sequences(xt)

# The ragged sequence lists go straight into pad_sequences() below; wrapping
# them in np.array() first is unnecessary and raises on recent NumPy.
y_train = np.array(y)
y_test = np.array(yt)

batch_size = 64
max_len = 500
# Call-style print with explicit formatting: same output as the former
# Python 2 print statement, but valid syntax on Python 2 and 3 alike.
print("max_len  %d" % max_len)
print('Pad sequences (samples x time)')

X_train = sequence.pad_sequences(X_train, maxlen=max_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_len)

max_features = 5000
print('Build model...')
embedding_vecor_length = 32

model = Sequential()
model.add(Embedding(max_features, embedding_vecor_length, input_length=max_len))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so it is not
# wrapped in print() (that produced a stray "None" line).
model.summary()

model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=50,validation_data=(X_test, y_test),shuffle=True)
score, acc = model.evaluate(X_test, y_test)

print('Test score:', score)
print('Test accuracy:', acc)
y_pred = model.predict_classes(X_test)
# With a single sigmoid output, predict_classes may return shape (n, 1);
# flatten it so it pairs with the 1-D y_test.  Each output row is
# "<true label> <predicted label>".
np.savetxt('output.txt', np.column_stack((y_test, np.ravel(y_pred))), fmt='%01d')

Loading

0 comments on commit dc86909

Please sign in to comment.