Skip to content

Commit

Permalink
Optimise CSV export by saving columns (#2762)
Browse files Browse the repository at this point in the history
* optimise CSV export by saving columns

save column build and update build after an Instance is saved. The previous optimisation technique saved the maximum number for each repeat group. This resulted in extra columns for submissions with less repeats than the maximum

* suppress lint warning protected-access

* refactor code

* update export registe after for replacement

* create default ordered columns when registering xform repeats

* fix lint warning inconsistent-return-statements

* fix lint warning no-else-return

* wait for record to be persisted before calling async task

* update doc strings

* rename symbols

* rename symbol

* increase queryset chunk_size

* add support for merged select multiples in export columns register

* refactor code

* refactor code

* refactor code

* suppress lint warning

* refactor code

* rename symbol
  • Loading branch information
kelvin-muchiri authored Jan 28, 2025
1 parent df12f7e commit f911be9
Show file tree
Hide file tree
Showing 16 changed files with 882 additions and 443 deletions.
8 changes: 4 additions & 4 deletions onadata/apps/api/tests/viewsets/test_abstract_viewset.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
from onadata.apps.main.models import MetaData, UserProfile
from onadata.apps.main.tests.test_base import TestBase
from onadata.apps.viewer.models import DataDictionary
from onadata.apps.viewer.models.data_dictionary import create_export_repeat_register
from onadata.apps.viewer.models.data_dictionary import create_or_update_export_register
from onadata.libs.serializers.project_serializer import ProjectSerializer
from onadata.libs.utils.common_tools import merge_dicts

Expand Down Expand Up @@ -139,15 +139,15 @@ def setUp(self):
self.maxDiff = None
# Disable signals
post_save.disconnect(
sender=DataDictionary, dispatch_uid="create_export_repeat_register"
sender=DataDictionary, dispatch_uid="create_or_update_export_register"
)

def tearDown(self):
# Enable signals
post_save.connect(
sender=DataDictionary,
dispatch_uid="create_export_repeat_register",
receiver=create_export_repeat_register,
dispatch_uid="create_or_update_export_register",
receiver=create_or_update_export_register,
)

TestCase.tearDown(self)
Expand Down
8 changes: 4 additions & 4 deletions onadata/apps/logger/models/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,12 +879,12 @@ def permanently_delete_attachments(sender, instance=None, created=False, **kwarg


@use_master
def register_export_repeats(sender, instance, created=False, **kwargs):
def register_instance_repeat_columns(sender, instance, created=False, **kwargs):
# Avoid cyclic dependency errors
logger_tasks = importlib.import_module("onadata.apps.logger.tasks")

transaction.on_commit(
lambda: logger_tasks.register_instance_export_repeats_async.delay(instance.pk)
lambda: logger_tasks.register_instance_repeat_columns_async.delay(instance.pk)
)


Expand All @@ -905,9 +905,9 @@ def register_export_repeats(sender, instance, created=False, **kwargs):
)

post_save.connect(
register_export_repeats,
register_instance_repeat_columns,
sender=Instance,
dispatch_uid="register_export_repeats",
dispatch_uid="register_instance_repeat_columns",
)


Expand Down
23 changes: 13 additions & 10 deletions onadata/apps/logger/models/xform.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
from pyxform import SurveyElementBuilder, constants, create_survey_element_from_dict
from pyxform.question import Question
from pyxform.section import RepeatingSection
from pyxform.xform2json import create_survey_element_from_xml
from six import iteritems
from taggit.managers import TaggableManager

Expand Down Expand Up @@ -374,18 +373,22 @@ def get_unique_id_string(self, id_string, count=0):

return id_string

def _get_survey(self):
try:
builder = SurveyElementBuilder()
if isinstance(self.json, str):
return builder.create_survey_element_from_json(self.json)
if isinstance(self.json, dict):
return builder.create_survey_element_from_dict(self.json)
except ValueError:
pass

return bytes(bytearray(self.xml, encoding="utf-8"))

def get_survey(self):
"""Returns an XML XForm survey object."""
if not hasattr(self, "_survey"):
try:
builder = SurveyElementBuilder()
if isinstance(self.json, str):
self._survey = builder.create_survey_element_from_json(self.json)
if isinstance(self.json, dict):
self._survey = builder.create_survey_element_from_dict(self.json)
except ValueError:
xml = bytes(bytearray(self.xml, encoding="utf-8"))
self._survey = create_survey_element_from_xml(xml)
self._survey = self._get_survey()
return self._survey

survey = property(get_survey)
Expand Down
16 changes: 8 additions & 8 deletions onadata/apps/logger/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
commit_cached_elist_num_entities,
dec_elist_num_entities,
inc_elist_num_entities,
register_instance_export_repeats,
register_xform_export_repeats,
reconstruct_xform_export_register,
register_instance_repeat_columns,
soft_delete_entities_bulk,
)
from onadata.libs.utils.project_utils import set_project_perms_to_object
Expand Down Expand Up @@ -116,8 +116,8 @@ def dec_elist_num_entities_async(elist_pk: int) -> None:


@app.task(retry_backoff=3, autoretry_for=(DatabaseError, ConnectionError))
def register_instance_export_repeats_async(instance_pk: int) -> None:
"""Register export repeats asynchronously
def register_instance_repeat_columns_async(instance_pk: int) -> None:
"""Register Instance repeat columns asynchronously
:param instance_pk: Primary key for Instance
"""
Expand All @@ -128,12 +128,12 @@ def register_instance_export_repeats_async(instance_pk: int) -> None:
logger.exception(exc)

else:
register_instance_export_repeats(instance)
register_instance_repeat_columns(instance)


@app.task(retry_backoff=3, autoretry_for=(DatabaseError, ConnectionError))
def register_xform_export_repeats_async(xform_id: int) -> None:
"""Register export repeats for an XForm asynchronously
def reconstruct_xform_export_register_async(xform_id: int) -> None:
"""Register a XForm's Instances export columns asynchronously
:param xform_id: Primary key for XForm
"""
Expand All @@ -144,4 +144,4 @@ def register_xform_export_repeats_async(xform_id: int) -> None:
logger.exception(exc)

else:
register_xform_export_repeats(xform)
reconstruct_xform_export_register(xform)
108 changes: 100 additions & 8 deletions onadata/apps/logger/tests/models/test_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
Test Instance model.
"""

import json
import os
from collections import OrderedDict
from datetime import datetime, timedelta
from unittest.mock import Mock, patch

from django.contrib.contenttypes.models import ContentType
from django.http.request import HttpRequest
from django.test import override_settings
from django.utils.timezone import utc
Expand Down Expand Up @@ -1211,8 +1214,8 @@ def test_xml_entity_node_missing(self):

self.assertEqual(Entity.objects.count(), 0)

def test_register_repeats(self):
"""Repeats are registered correctly"""
def test_repeat_columns_registered(self):
"""Instance repeat columns are added to export columns register"""
project = get_user_default_project(self.user)
md = """
| survey |
Expand All @@ -1229,6 +1232,34 @@ def test_register_repeats(self):
| | Births | births | |
"""
xform = self._publish_markdown(md, self.user, project)
register = MetaData.objects.get(
data_type="export_columns_register",
object_id=xform.pk,
content_type=ContentType.objects.get_for_model(xform),
)
# Default export columns are correctly registered
merged_multiples_columns = json.loads(
register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict
)
split_multiples_columns = json.loads(
register.extra_data["split_multiples"], object_pairs_hook=OrderedDict
)
expected_columns = OrderedDict(
[
(
"hospital_repeat",
[],
),
(
"hospital_repeat/child_repeat",
[],
),
("meta/instanceID", None),
]
)
self.assertEqual(merged_multiples_columns, expected_columns)
self.assertEqual(split_multiples_columns, expected_columns)

xml = (
'<?xml version="1.0" encoding="UTF-8"?>'
'<data xmlns:jr="http://openrosa.org/javarosa" xmlns:orx='
Expand Down Expand Up @@ -1259,9 +1290,37 @@ def test_register_repeats(self):
)
# Repeats are registered on creation
instance = Instance.objects.create(xml=xml, user=self.user, xform=xform)
metadata = MetaData.objects.get(data_type="export_repeat_register")
self.assertEqual(metadata.extra_data.get("hospital_repeat"), 2)
self.assertEqual(metadata.extra_data.get("child_repeat"), 2)
register.refresh_from_db()
merged_multiples_columns = json.loads(
register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict
)
split_multiples_columns = json.loads(
register.extra_data["split_multiples"], object_pairs_hook=OrderedDict
)
expected_columns = OrderedDict(
[
(
"hospital_repeat",
["hospital_repeat[1]/hospital", "hospital_repeat[2]/hospital"],
),
(
"hospital_repeat/child_repeat",
[
"hospital_repeat[1]/child_repeat[1]/name",
"hospital_repeat[1]/child_repeat[1]/birthweight",
"hospital_repeat[1]/child_repeat[2]/name",
"hospital_repeat[1]/child_repeat[2]/birthweight",
"hospital_repeat[2]/child_repeat[1]/name",
"hospital_repeat[2]/child_repeat[1]/birthweight",
],
),
("meta/instanceID", None),
]
)

self.assertEqual(merged_multiples_columns, expected_columns)
self.assertEqual(split_multiples_columns, expected_columns)

# Repeats are registered on update
xml = (
'<?xml version="1.0" encoding="UTF-8"?>'
Expand Down Expand Up @@ -1302,6 +1361,39 @@ def test_register_repeats(self):
instance.xml = xml
instance.uuid = "51cb9e07-cfc7-413b-bc22-ee7adfa9dec4"
instance.save()
metadata.refresh_from_db()
self.assertEqual(metadata.extra_data.get("hospital_repeat"), 3)
self.assertEqual(metadata.extra_data.get("child_repeat"), 2)
register.refresh_from_db()
merged_multiples_columns = json.loads(
register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict
)
split_multiples_columns = json.loads(
register.extra_data["split_multiples"], object_pairs_hook=OrderedDict
)
expected_columns = OrderedDict(
[
(
"hospital_repeat",
[
"hospital_repeat[1]/hospital",
"hospital_repeat[2]/hospital",
"hospital_repeat[3]/hospital",
],
),
(
"hospital_repeat/child_repeat",
[
"hospital_repeat[1]/child_repeat[1]/name",
"hospital_repeat[1]/child_repeat[1]/birthweight",
"hospital_repeat[1]/child_repeat[2]/name",
"hospital_repeat[1]/child_repeat[2]/birthweight",
"hospital_repeat[2]/child_repeat[1]/name",
"hospital_repeat[2]/child_repeat[1]/birthweight",
"hospital_repeat[3]/child_repeat[1]/name",
"hospital_repeat[3]/child_repeat[1]/birthweight",
],
),
("meta/instanceID", None),
]
)

self.assertEqual(merged_multiples_columns, expected_columns)
self.assertEqual(split_multiples_columns, expected_columns)
Loading

0 comments on commit f911be9

Please sign in to comment.