Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid DB import bugs by trimming strings at import transform #157

Merged
merged 2 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## 6.20.11 - Nov 22, 2024

* Use transformers to trim incoming strings at import that are too long for DB columns:
  * Bill: document note, version note
  * Event: media note

## 6.20.10 - Nov 7, 2024

* Add additional log info re: archiving scrape files to cloud storage
Expand Down
24 changes: 14 additions & 10 deletions openstates/importers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,16 +532,20 @@ def apply_transformers(
if transformers is None:
transformers = self.cached_transformers

for key, key_transformers in transformers.items():
if key not in data:
continue
if isinstance(key_transformers, list):
for transformer in key_transformers:
data[key] = transformer(data[key])
elif isinstance(key_transformers, dict):
self.apply_transformers(data[key], key_transformers)
else:
data[key] = key_transformers(data[key])
if isinstance(data, list):
for data_item in data:
self.apply_transformers(data_item, transformers)
else:
for key, key_transformers in transformers.items():
if key not in data:
continue
if isinstance(key_transformers, list):
for transformer in key_transformers:
data[key] = transformer(data[key])
elif isinstance(key_transformers, dict):
self.apply_transformers(data[key], key_transformers)
else:
data[key] = key_transformers(data[key])

return data

Expand Down
11 changes: 10 additions & 1 deletion openstates/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,16 @@
CACHE_DIR = os.path.join(os.getcwd(), "_cache")
SCRAPED_DATA_DIR = os.path.join(os.getcwd(), "_data")

IMPORT_TRANSFORMERS = {"bill": {"identifier": transformers.fix_bill_id}}
# Transformers applied to scraped data at import time.
# Top-level keys ("bill", "event") name the kind of object being imported;
# presumably each importer looks up its own entry -- verify against the importer.
# Within an entry, a key maps a field name either to a callable applied to that
# field's value, or to a nested dict of transformers for the field's sub-objects
# (e.g. the "note" of each bill document).
# The truncate_300 entries trim notes that are too long for their DB columns.
IMPORT_TRANSFORMERS = {
"bill": {
"identifier": transformers.fix_bill_id,
"documents": {"note": transformers.truncate_300}, # TODO remove when db migration done
"versions": {"note": transformers.truncate_300}, # TODO remove when db migration done
},
"event": {
"media": {"note": transformers.truncate_300}, # TODO remove when db migration done
}
}

# Django settings
LOGGING = {
Expand Down
4 changes: 4 additions & 0 deletions openstates/utils/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,7 @@ def fix_bill_id(bill_id: str) -> str:

def collapse_whitespace(value: str) -> str:
    """Return *value* with each match of the module's whitespace pattern replaced by one space."""
    single_space = " "
    collapsed = _whitespace_re.sub(single_space, value)
    return collapsed


def truncate_300(value: str) -> str:
    """Return at most the first 300 characters of *value*.

    Strings of 300 characters or fewer are returned unchanged.
    """
    max_length = 300
    return value[:max_length]
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "openstates"
version = "6.20.10"
version = "6.20.11"
description = "core infrastructure for the openstates project"
authors = ["James Turk <[email protected]>"]
license = "MIT"
Expand Down
Loading