cl_DBR.py
# -*- coding: utf-8 -*-
# External imports
import xml.etree.ElementTree as ET
import pymarc
from fuzzywuzzy import fuzz
from enum import Enum
from typing import List, Tuple
# Internal imports
from cl_PODA import Processing, Processing_Data_Target, Filters, Mapped_Fields
from cl_ES import Records_Settings, Analysis_Checks
from cl_UDE import Universal_Data_Extractor
from func_string_manip import list_as_string, clean_publisher, nettoie_titre, get_year
# -------------------- Enums --------------------
class Analysis_Final_Results(Enum):
    UNKNOWN = {
        "eng":"Unknown final result",
        "fre":"Vérification inconnue"
    }
    NO_CHECK = {
        "eng":"No checks",
        "fre":"Pas de vérification"
    }
    TOTAL_MATCH = {
        "eng":"All checks were successful",
        "fre":"Vérifications complètes"
    }
    PARTIAL_MATCH = {
        "eng":"Checks partially successful",
        "fre":"Vérifications partielles"
    }
    NO_MATCH = {
        "eng":"All checks failed",
        "fre":"Vérifications KO"
    }
class Other_Database_Id_In_Target(Enum):
    UNKNOWN = {
        "eng":"Unknown",
        "fre":"Inconnu"
    }
    SKIPPED = {
        "eng":"Skipped",
        "fre":"Ignoré"
    }
    NO_OTHER_DB_ID = {
        "eng":"No ID in the list",
        "fre":"Aucun ID dans la liste"
    }
    THIS_ID_INCLUDED = {
        "eng":"Included in list",
        "fre":"Présent dans la liste"
    }
    ONLY_THIS_OTHER_DB_ID = {
        "eng":"List includes only this ID",
        "fre":"Liste ne comportant que cet ID"
    }
    THIS_ID_NOT_INCLUDED = {
        "eng":"Missing from list",
        "fre":"Absent de la liste"
    }
# -------------------- Classes --------------------
class Database_Record(object):
    """Contains the extracted data from a record.
    The data property holds all the mapped data for the chosen processing."""
    def __init__(self, processing: Processing, record: ET.ElementTree | dict | pymarc.record.Record, fcr_processed_id:str, is_target_db: bool, settings:Records_Settings):
        self.processing = processing
        self.record = record
        self.fcr_processed_id = fcr_processed_id
        self.database = self.processing.origin_database
        if is_target_db:
            self.database = self.processing.target_database
        self.is_target_db = is_target_db
        # Temporary variable for convenience
        marc_fields_json = settings.origin_db_marc_fields_json
        if self.is_target_db:
            marc_fields_json = settings.target_db_marc_fields_json
        self.ude = Universal_Data_Extractor(self.record, self.database.enum_member, marc_fields_json)
        self.data = {}
        for data in processing.mapped_data:
            if (
                (processing.mapped_data[data] == Processing_Data_Target.BOTH)
                or (self.is_target_db and processing.mapped_data[data] == Processing_Data_Target.TARGET)
                or (not self.is_target_db and processing.mapped_data[data] == Processing_Data_Target.ORIGIN)
            ):
                if data in self.database.filters:
                    filter_value = ""
                    if self.database.filters[data] == Filters.RCR:
                        filter_value = settings.rcr
                    elif self.database.filters[data] == Filters.ILN:
                        filter_value = settings.iln
                    elif self.database.filters[data] == Filters.FILTER1:
                        filter_value = settings.filter1
                    elif self.database.filters[data] == Filters.FILTER2:
                        filter_value = settings.filter2
                    elif self.database.filters[data] == Filters.FILTER3:
                        filter_value = settings.filter3
                    self.data[data] = self.ude.get_by_mapped_field_name(data, filter_value)
                else:
                    self.data[data] = self.ude.get_by_mapped_field_name(data)
        self.chosen_analysis = settings.chosen_analysis
        self.chosen_analysis_checks = settings.chosen_analysis_checks
        self.utils = self.Utils(self)
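    # --- Comparison methods (called by compare_to) ---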
    def __compare_titles(self, compared_to):
        """Compares the titles and sets their keys."""
        self.compared_title_key = compared_to.utils.get_first_title_as_string()
        self.title_key = self.utils.get_first_title_as_string()
        self.title_ratio = fuzz.ratio(self.title_key, self.compared_title_key)
        self.title_partial_ratio = fuzz.partial_ratio(self.title_key, self.compared_title_key)
        self.title_token_sort_ratio = fuzz.token_sort_ratio(self.title_key, self.compared_title_key)
        self.title_token_set_ratio = fuzz.token_set_ratio(self.title_key, self.compared_title_key)
    def __compare_dates(self, compared_to) -> None:
        """Checks if one of this record's dates matches one of the compared record's dates."""
        self.dates_matched = False
        # Merge the date lists
        this_dates = []
        for dates in self.data[Mapped_Fields.GENERAL_PROCESSING_DATA_DATES]:
            this_dates.extend(dates)
        compared_dates = []
        for dates in compared_to.data[Mapped_Fields.GENERAL_PROCESSING_DATA_DATES]:
            compared_dates.extend(dates)
        for date in this_dates:
            if date in compared_dates and date != " ": # excludes empty dates
                self.dates_matched = True
                return
    def __compare_publishers(self, compared_to) -> None:
        """Compares every publisher in this record with every publisher in the compared record."""
        self.publishers_score = -1
        self.chosen_publisher = ""
        self.chosen_compared_publisher = ""
        # If either record has no publisher, the comparison can't be done
        if len(self.data[Mapped_Fields.PUBLISHERS_NAME]) == 0 or len(compared_to.data[Mapped_Fields.PUBLISHERS_NAME]) == 0:
            return
        for publisher in self.data[Mapped_Fields.PUBLISHERS_NAME]:
            publisher = clean_publisher(publisher)
            for compared_publisher in compared_to.data[Mapped_Fields.PUBLISHERS_NAME]:
                compared_publisher = clean_publisher(compared_publisher)
                ratio = fuzz.ratio(publisher, compared_publisher)
                if ratio > self.publishers_score:
                    self.publishers_score = ratio
                    self.chosen_publisher = publisher
                    self.chosen_compared_publisher = compared_publisher
    def __compare_other_db_id(self, compared_to):
        """Checks if this record's other database IDs include the compared record's ID."""
        self.local_id_in_compared_record = Other_Database_Id_In_Target.UNKNOWN
        self.list_of_other_db_id = self.utils.get_other_db_id()
        # Other DB IDs were not extracted
        if self.list_of_other_db_id is None:
            self.local_id_in_compared_record = Other_Database_Id_In_Target.SKIPPED
            self.nb_other_db_id = 0
            return
        # They were extracted
        self.nb_other_db_id = len(self.list_of_other_db_id)
        id = list_as_string(compared_to.data[Mapped_Fields.ID])
        if self.nb_other_db_id == 0:
            self.local_id_in_compared_record = Other_Database_Id_In_Target.NO_OTHER_DB_ID
        elif self.nb_other_db_id == 1 and id in self.list_of_other_db_id:
            self.local_id_in_compared_record = Other_Database_Id_In_Target.ONLY_THIS_OTHER_DB_ID
        elif self.nb_other_db_id > 1 and id in self.list_of_other_db_id:
            self.local_id_in_compared_record = Other_Database_Id_In_Target.THIS_ID_INCLUDED
        elif id not in self.list_of_other_db_id:
            self.local_id_in_compared_record = Other_Database_Id_In_Target.THIS_ID_NOT_INCLUDED
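    # --- Analysis checks: compare the computed scores to the thresholds of the chosen analysis ---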
    def __analysis_check_title(self):
        self.check_title_nb_valids = 0
        # For each matching score, checks if it's high enough
        title_score_list = [
            self.title_ratio,
            self.title_partial_ratio,
            self.title_token_sort_ratio,
            self.title_token_set_ratio
        ]
        for title_score in title_score_list:
            if title_score >= self.chosen_analysis["TITLE_MIN_SCORE"]:
                self.check_title_nb_valids += 1
        self.checks[Analysis_Checks.TITLE] = (self.check_title_nb_valids >= self.chosen_analysis["NB_TITLE_OK"])
    def __analysis_checks(self, check):
        """Launches the check for the provided analysis"""
        # Titles
        if check == Analysis_Checks.TITLE:
            self.__analysis_check_title()
        # Publishers
        elif check == Analysis_Checks.PUBLISHER:
            self.checks[Analysis_Checks.PUBLISHER] = (self.publishers_score >= self.chosen_analysis["PUBLISHER_MIN_SCORE"])
        # Dates
        elif check == Analysis_Checks.DATE:
            self.checks[Analysis_Checks.DATE] = self.dates_matched
    def __finalize_analysis(self):
        """Summarizes all checks"""
        self.total_checks = Analysis_Final_Results.UNKNOWN
        self.passed_check_nb = 0
        self.checks = {}
        for check in Analysis_Checks:
            self.checks[check] = None
        if len(self.chosen_analysis_checks) == 0:
            self.total_checks = Analysis_Final_Results.NO_CHECK
        else:
            for check in self.chosen_analysis_checks:
                self.__analysis_checks(check)
                if self.checks[check] is True:
                    self.passed_check_nb += 1
            if self.passed_check_nb == len(self.chosen_analysis_checks):
                self.total_checks = Analysis_Final_Results.TOTAL_MATCH
            elif self.passed_check_nb > 0:
                self.total_checks = Analysis_Final_Results.PARTIAL_MATCH
            else:
                self.total_checks = Analysis_Final_Results.NO_MATCH
    def compare_to(self, compared_to):
        """Executes the analysis process.
        Takes as argument:
            - compared_to {Database_Record instance} : the record from the origin database"""
        self.__compare_titles(compared_to)
        self.__compare_dates(compared_to)
        self.__compare_publishers(compared_to)
        self.__compare_other_db_id(compared_to)
        self.__finalize_analysis()
    def data_to_json(self) -> dict:
        """Returns the data for a JSON export, using Mapped_Fields names as keys"""
        out = {}
        for data in self.data:
            out[data.name] = self.data[data]
        return out
    def analysis_to_json(self) -> dict | None:
        """Returns the analysis data as a dict for a JSON export"""
        if isinstance(self.total_checks, Analysis_Final_Results):
            return {
                "title":{
                    "title_ratio":self.title_ratio,
                    "title_partial_ratio":self.title_partial_ratio,
                    "title_token_sort_ratio":self.title_token_sort_ratio,
                    "title_token_set_ratio":self.title_token_set_ratio,
                },
                "dates":self.dates_matched,
                "publishers":{
                    "score":self.publishers_score,
                    "target_db":self.chosen_publisher,
                    "origin_db":self.chosen_compared_publisher
                },
                "other_ids":{
                    "nb":self.nb_other_db_id,
                    "result":self.local_id_in_compared_record.name
                },
                "global":{
                    "result":self.total_checks.name,
                    "nb_succesful_checks":self.passed_check_nb,
                    "title":self.checks[Analysis_Checks.TITLE],
                    "publisher":self.checks[Analysis_Checks.PUBLISHER],
                    "date":self.checks[Analysis_Checks.DATE]
                }
            }
        else:
            return None
    # --- Utils methods for other classes / functions ---
    class Utils:
        def __init__(self, parent) -> None:
            self.parent = parent
            self.data: dict = self.parent.data
        def get_id(self) -> str:
            """Returns the record ID as a string"""
            if Mapped_Fields.ID not in self.data:
                return ""
            return list_as_string(self.data[Mapped_Fields.ID])
        def get_first_title_as_string(self) -> str:
            """Returns the first title cleaned up as a string"""
            if Mapped_Fields.TITLE not in self.data:
                return ""
            if len(self.data[Mapped_Fields.TITLE]) < 1:
                return ""
            return nettoie_titre(list_as_string(self.data[Mapped_Fields.TITLE][0]))
        def get_titles_as_string(self) -> str:
            """Returns all titles cleaned up as a str"""
            if Mapped_Fields.TITLE not in self.data:
                return ""
            return nettoie_titre(list_as_string(self.data[Mapped_Fields.TITLE]))
        def get_authors_as_string(self) -> str:
            """Returns all authors cleaned up as a str"""
            if Mapped_Fields.AUTHORS not in self.data:
                return ""
            return nettoie_titre(list_as_string(self.data[Mapped_Fields.AUTHORS]))
        def get_all_publishers_as_string(self) -> str:
            """Returns all publishers cleaned up as a str"""
            if Mapped_Fields.PUBLISHERS_NAME not in self.data:
                return ""
            return clean_publisher(list_as_string(self.data[Mapped_Fields.PUBLISHERS_NAME]))
        def get_all_publication_dates(self) -> Tuple[List[int], int, int]:
            """Returns a tuple:
            - all publication dates as a list of int
            - the oldest date as an int (None if no date)
            - the newest date as an int (None if no date)"""
            dates = []
            if Mapped_Fields.PUBLICATION_DATES not in self.data:
                return dates, None, None
            for date_str in self.data[Mapped_Fields.PUBLICATION_DATES]:
                dates += get_year(date_str)
            # Converts every date to an int
            dates = [int(date) for date in dates]
            if dates == []:
                return dates, None, None
            return dates, min(dates), max(dates)
        def get_first_ean_as_string(self) -> str:
            """Returns the first EAN as a str"""
            ean = ""
            if Mapped_Fields.EAN not in self.data:
                return ean
            for val in self.data[Mapped_Fields.EAN]:
                if isinstance(val, str) and val != "":
                    ean = val
                    break
            return ean
        def get_first_isbn_as_string(self) -> str:
            """Returns the first ISBN as a str"""
            isbn = ""
            if Mapped_Fields.ISBN not in self.data:
                return isbn
            for val in self.data[Mapped_Fields.ISBN]:
                if isinstance(val, str) and val != "":
                    isbn = val
                    break
            return isbn
        def get_other_db_id(self) -> List[str]|None:
            """Returns the other DB IDs as a list of str.
            Returns None if the data was not extracted"""
            if Mapped_Fields.OTHER_DB_ID not in self.data:
                return None
            return self.data[Mapped_Fields.OTHER_DB_ID]
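# Illustrative sketch (not part of the module's API): a minimal, hypothetical demo of the
# four fuzzywuzzy scores that __compare_titles computes and that __analysis_check_title
# later measures against the TITLE_MIN_SCORE threshold. The sample titles are made up.
if __name__ == "__main__":
    _title = "introduction to information retrieval"
    _compared_title = "an introduction to information retrieval systems"
    for _name, _scorer in (
        ("ratio", fuzz.ratio),
        ("partial_ratio", fuzz.partial_ratio),
        ("token_sort_ratio", fuzz.token_sort_ratio),
        ("token_set_ratio", fuzz.token_set_ratio),
    ):
        # Each scorer returns an integer between 0 and 100
        print(f"{_name}: {_scorer(_title, _compared_title)}")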