Skip to content

Commit

Permalink
v1.17.1 (AR500 fix cleaning func)
Browse files Browse the repository at this point in the history
  • Loading branch information
Alban Peyrat (Archi) committed Mar 14, 2024
1 parent 6d573de commit 82d268d
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 3 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@ _Some previous changes will be added_

## [Unreleased]

## [1.17.1] - 2024-03-14

### Fixed

* Added `UN` to the list of empty words deleted
* Added `+` to the list of deleted noise
* Logs now properly write the target database value instead of the origin database twice

## [1.17.0] - 2024-02-28

### Changed
Expand Down
4 changes: 2 additions & 2 deletions fcr_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def prep_string(_str:str, _noise = True, _multiplespaces = True) -> str:
"""
# remove noise (punctuation) if asked (by default yes)
if _noise:
_str = re.sub(r"\.|\,|\?|\!|\;|\/|\:|\=|\[|\]|\'|\-|\(|\)|\||\"|\<|\>", " ", _str, flags=re.IGNORECASE)
_str = re.sub(r"\.|\,|\?|\!|\;|\/|\:|\=|\[|\]|\'|\-|\(|\)|\||\"|\<|\>|\+", " ", _str, flags=re.IGNORECASE)
# replace multiple spaces by one in string if requested (default yes)
if _multiplespaces:
_str = re.sub("\s+", " ", _str).strip()
Expand Down Expand Up @@ -114,7 +114,7 @@ def delete_CBS_boolean_operators(txt:str) -> str:

def delete_Sudoc_empty_words(txt:str) -> str:
    """Delete all Sudoc empty keywords (index TOUT) to simplify the query.

    Every whole word of ``txt`` that belongs to the empty-word list is
    removed, ignoring case, then each run of whitespace is collapsed
    into a single space.

    Args:
        txt: the query text to clean.

    Returns:
        The cleaned text. It is not stripped, so a removed first or last
        word can leave one leading or trailing space behind.
    """
    # \b anchors restrict the match to whole words, so a longer word that
    # merely contains an empty word (e.g. "UNIQUE" vs "UN") is left intact.
    txt = re.sub(
        r"\b(A|BIS|DI|IL|OF|THE|AB|BY|DIE|IM|ON|THEIR|ABOUT|C|DONT|IMPR|OU|THIS|ACCORDING|CE|DR|IN|OVER|TO|ACROSS|CETTE|DU|INTO|P|UEBER|AD|CEUX|DURANT|E|PAR|UM|AGAINST|CHEZ|DURANTE|ITS|PER|UND|AINSI|CO|DURCH|J|PLUS|UNDER|AL|COMME|DURING|L|POR|UNE|ALL|COMO|E|LA|POUR|UNLESS|ALLA|CUM|ED|LAS|QU|UNTER|ALLE|D|EIN|LE|QUAE|UPON|ALS|DAL|EINE|LES|QUE|VOM|ALSO|DALL|EINEM|LEUR|R|VON|ALTRE|DALLA|EINER|LEURS|S|VOR|AM|DANS|EINES|LO|SANS|VOS|AMONG|DAS|EL|LOS|SE|VOTRE|AN|DE|EN|M|SELON|VOUS|AND|DEGLI|ES|MES|SES|W|ASI|DEL|ET|MIT|SIC|WAS|AT|DELL|F|N|SINCE|WE|ATQUE|DELLA|FOR|NACH|SIVE|WHITCH|AU|DELLE|FROM|NE|SN|WITH|AUF|DELLO|FUER|NEAR|SO|Y|AUPRES|DEM|G|NEL|SOME|ZU|AUS|DEN|GLI|NO|SOUS|ZUR|AUSSI|DEPUIS|H|NOS|ST|AUX|DER|HIS|NOTRE|SUL|AVEC|DEREN|I|NOUS|SUR|B|DES|IHRE|O|TE|BEI|DESDE|IHRER|ODER|THAT|UN)\b",
        "",
        txt,
        flags=re.IGNORECASE,
    )
    # Removing words leaves gaps behind: squeeze them down to one space.
    return re.sub(r"\s+", " ", txt)

def delete_for_sudoc(txt:str) -> str:
Expand Down
2 changes: 1 addition & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def main(es: fcr.Execution_Settings):
if es.processing.enum_member == fcr.Processing_Names.MARC_FILE_IN_KOHA_SRU:
es.log.simple_info("Koha URL", es.target_url)
es.log.simple_info("Origin database", es.processing.origin_database.name)
es.log.simple_info("Target database", es.processing.origin_database.name)
es.log.simple_info("Target database", es.processing.target_database.name)
es.log.simple_info("Origin database mapping", es.origin_database_mapping)
es.log.simple_info("Target database mapping", es.target_database_mapping)

Expand Down

0 comments on commit 82d268d

Please sign in to comment.