-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_preprocess.py
More file actions
136 lines (92 loc) · 6.3 KB
/
data_preprocess.py
File metadata and controls
136 lines (92 loc) · 6.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import numpy as np
import pandas as pd
import ast
from tqdm import tqdm
import time
import api
class Preprocess():
    """Build a feature-engineered click/no-click table for a recommender.

    Pipeline: pair each answer-click log with the recommendation log that
    produced it, expand with negative samples from the impression list, then
    left-join user / answer / job side information loaded via the `api` module.
    """

    def __init__(self, log_use_features):
        # Column names, in positional order, kept from the merged log frame.
        # NOTE(review): make_negative_answerlog assumes this order is exactly
        # [rec_timestamp, answer, rectype, user, label, rec_log_id,
        #  question_from_user, company, job_small, question_type] — confirm
        # against the caller's configuration.
        self.log_use_features = log_use_features
        self.fe_data = pd.DataFrame()

    def map_answerlog_last_reclog(self, answerlogs, reclogs):
        """Attach to every answer click the latest preceding rec log of the same user.

        Returns a frame with the answer-log columns (plus label=1) followed by
        the matched rec-log columns (user renamed rec_user_id, timestamp
        renamed rec_timestamp); the intermediate answer_timestamp is dropped.
        """
        # Rename via reassignment first so the caller's DataFrame is not
        # mutated (the old code wrote the 'label' column into it in place).
        answerlogs = answerlogs.rename(columns={'timestamp': 'answer_timestamp'})
        answerlogs["label"] = 1  # every click log is a positive sample
        matched_rows = []
        for _, click in answerlogs.iterrows():
            earlier = reclogs[(reclogs["user"] == click["user"]) &
                              (reclogs["timestamp"] < click["answer_timestamp"])]
            # Raises IndexError when a click has no preceding rec log — same
            # contract as the original code.
            matched_rows.append(earlier.iloc[-1])
        _last_reclog = pd.DataFrame(matched_rows, columns=reclogs.columns).reset_index(drop=True)
        _last_reclog = _last_reclog.rename(columns={'user': 'rec_user_id'})
        # reset_index so the column-wise concat pairs row i with row i even
        # when answerlogs carries a non-RangeIndex (the old index-aligned
        # concat could silently misalign rows in that case).
        fe_concat = pd.concat([answerlogs.reset_index(drop=True), _last_reclog], axis=1)
        fe_concat = fe_concat.drop(columns=['answer_timestamp'])
        fe_concat = fe_concat.rename(columns={'timestamp': 'rec_timestamp'})
        return fe_concat

    def make_negative_answerlog(self, fe_concat):
        """Add negative samples (shown-but-not-clicked answers) and store in self.fe_data.

        Each rec log contributes its impression list; every impressed answer
        that was never clicked becomes a label-0 row.
        """
        fe_data = fe_concat[self.log_use_features]
        # answers actually clicked, grouped per recommendation log
        rec_answer_id = fe_data.groupby('rec_log_id')['answer'].apply(list)
        negatives = []
        last_rec_log = 0
        for _, value in tqdm(fe_concat.iterrows()):
            if value["rec_log_id"] == last_rec_log:
                # consecutive row from the same rec-button log → already expanded
                continue
            watched = rec_answer_id[value["rec_log_id"]]
            if value["impressions"]:  # only when an impression list was recorded
                impressions = ast.literal_eval(value["impressions"])
                # NOTE(review): assumes every impression list has >= 20 slots — confirm.
                for i in range(20):
                    if impressions[i] in watched:
                        # impressed answer that WAS clicked → not a negative
                        continue
                    # slots 18-19 carry rec type 7000006; the first 18 slots
                    # map four-at-a-time to 7000001..7000005
                    if i in [18, 19]:
                        rectype = '7000006'
                    else:
                        rectype = '700000' + str((i // 4) + 1)
                    # Positional order must match self.log_use_features.
                    negatives.append([value["rec_timestamp"], impressions[i], rectype,
                                      value["user"], 0, value["rec_log_id"],
                                      value["question_from_user"], value["company"],
                                      value["job_small"], value["question_type"]])
            last_rec_log = value["rec_log_id"]
        if negatives:
            # Single concat instead of the removed (and quadratic)
            # DataFrame.append-per-row of the original code.
            fe_data = pd.concat([fe_data, pd.DataFrame(negatives, columns=fe_data.columns)],
                                ignore_index=True)
        fe_data = fe_data.rename(columns={"rec_timestamp": "timestamp",
                                          "company": "rec_company",
                                          "job_small": "rec_job_small",
                                          "question_type": "rec_question_type"})
        self.fe_data = fe_data

    def merge_side_information_user(self):
        """Left-join user side information (career type, interest, major) onto fe_data."""
        user_data = api.load_user()
        # .copy() so the rename below acts on an owned frame, not a slice view
        use_user_data = user_data[["id", "career_type", "interesting_job_large", "major_small"]].copy()
        use_user_data = use_user_data.rename(columns={'major_small': 'user_major_small',
                                                      "id": "user",
                                                      "interesting_job_large": "user_job_large",
                                                      "career_type": "user_career_type"})
        self.fe_data = self.fe_data.merge(use_user_data, how="left", on="user")

    def merge_side_information_answer(self):
        """Left-join answer statistics and the answer's document metadata onto fe_data."""
        answers = api.load_answer()
        documents = api.load_document()
        answer_with_doc = answers.merge(
            documents[['document_id', 'company', 'job_small', 'major_small', "pro_rating"]],
            on='document_id')
        # NOTE(review): positional renaming — assumes api.load_answer() returns
        # exactly these 9 columns in this order; verify against the api module.
        answer_with_doc.columns = ['answer', 'user_good_cnt', 'user_bad_cnt',
                                   'answer_pro_good_cnt', 'answer_pro_bad_cnt',
                                   'doc_view', 'user_view', 'document',
                                   'user_impression_cnt', 'answer_question_types',
                                   'doc_company_id', 'doc_job_small_id',
                                   'doc_major_small_id', "doc_pro_rating"]
        self.fe_data = self.fe_data.merge(answer_with_doc, how="left", on="answer")
        self.fe_data = self.fe_data.rename(columns={"doc_major_small_id": "doc_major_small",
                                                    "doc_major_large_id": "doc_major_large",
                                                    "doc_company_id": "doc_company",
                                                    "doc_job_small_id": "doc_job_small"})

    def merge_side_information_job(self):
        """Left-join the job-large category for both the rec context and the document."""
        jobsmall = api.load_job()
        # .copy() on each slice so the renames act on owned frames
        rec_jobsmall = jobsmall[["job_small_id", "job_large"]].copy()
        rec_jobsmall = rec_jobsmall.rename(columns={"job_small_id": "rec_job_small",
                                                    "job_large": "rec_job_large"})
        doc_jobsmall = jobsmall[["job_small_id", "job_large"]].copy()
        doc_jobsmall = doc_jobsmall.rename(columns={"job_small_id": "doc_job_small",
                                                    "job_large": "doc_job_large"})
        self.fe_data = self.fe_data.merge(rec_jobsmall, how="left", on="rec_job_small")
        self.fe_data = self.fe_data.merge(doc_jobsmall, how="left", on="doc_job_small")

    def _feature_engineering_data(self):
        """Sort by rec log and project fe_data down to the final model feature columns."""
        self.fe_data = self.fe_data.sort_values(by=['rec_log_id'])
        self.fe_data = self.fe_data[["user", "user_career_type", "user_job_large",
                                     "user_major_small", "rectype", "rec_company",
                                     "rec_job_large", "rec_job_small", "rec_question_type",
                                     "answer", "answer_pro_good_cnt", "answer_pro_bad_cnt",
                                     "answer_question_types", "document", "doc_view",
                                     "doc_company", "doc_job_large", "doc_job_small",
                                     "doc_major_small", "doc_pro_rating", "label"]]

    def make_coin_feature(self):
        """Add 1.0/0.0 'coincidence' features: does the rec context match the document?"""
        pairs = [("rec_company", "doc_company", "coin_company"),
                 ("rec_job_large", "doc_job_large", "coin_joblarge"),
                 ("rec_job_small", "doc_job_small", "coin_jobsmall")]
        for rec_col, doc_col, out_col in pairs:
            # NaN on either side compares unequal → 0.0, as in the original
            self.fe_data[out_col] = (self.fe_data[rec_col] == self.fe_data[doc_col]).astype(float)
        # Is the question type the user was recommended among the answer's types?
        flags = []
        # Iterate the real index labels — range(len(df)) with .loc breaks after
        # sort_values leaves a non-contiguous index.
        for idx in self.fe_data.index:
            types_raw = self.fe_data.at[idx, "answer_question_types"]
            if pd.isna(types_raw):
                # left merges can leave this NaN; treat missing as "no match"
                flags.append(0.0)
                continue
            types = str(types_raw).split(",")
            flags.append(1.0 if str(self.fe_data.at[idx, "rec_question_type"]) in types else 0.0)
        self.fe_data["coin_question_type"] = flags

    def get_fe_data(self):
        """Return the accumulated feature-engineered DataFrame."""
        return self.fe_data