From f911be9f4b48b411c27bd583ede26a2552326635 Mon Sep 17 00:00:00 2001 From: Kelvin Muchiri Date: Tue, 28 Jan 2025 11:21:57 +0300 Subject: [PATCH] Optimise CSV export by saving columns (#2762) * optimise CSV export by saving columns save column build and update build after an Instance is saved. The previous optimisation technique saved the maximum number for each repeat group. This resulted in extra columns for submissions with less repeats than the maximum * suppress lint warning protected-access * refactor code * update export registe after for replacement * create default ordered columns when registering xform repeats * fix lint warning inconsistent-return-statements * fix lint warning no-else-return * wait for record to be persisted before calling async task * update doc strings * rename symbols * rename symbol * increase queryset chunk_size * add support for merged select multiples in export columns register * refactor code * refactor code * refactor code * suppress lint warning * refactor code * rename symbol --- .../tests/viewsets/test_abstract_viewset.py | 8 +- onadata/apps/logger/models/instance.py | 8 +- onadata/apps/logger/models/xform.py | 23 +- onadata/apps/logger/tasks.py | 16 +- .../apps/logger/tests/models/test_instance.py | 108 +++++- onadata/apps/logger/tests/test_tasks.py | 48 +-- onadata/apps/main/models/meta_data.py | 40 ++- onadata/apps/main/tests/test_csv_export.py | 8 +- onadata/apps/main/tests/test_form_metadata.py | 2 +- onadata/apps/viewer/models/data_dictionary.py | 23 +- .../models/tests/test_data_dictionary.py | 103 +++++- onadata/libs/tests/utils/test_csv_builder.py | 334 +++++++++++------- onadata/libs/tests/utils/test_logger_tools.py | 319 +++++++++++++---- onadata/libs/utils/common_tags.py | 2 +- onadata/libs/utils/csv_builder.py | 145 +++----- onadata/libs/utils/logger_tools.py | 138 ++++---- 16 files changed, 882 insertions(+), 443 deletions(-) diff --git a/onadata/apps/api/tests/viewsets/test_abstract_viewset.py b/onadata/apps/api/tests/viewsets/test_abstract_viewset.py index 845992522f..7d0e499947 100644 --- a/onadata/apps/api/tests/viewsets/test_abstract_viewset.py +++ b/onadata/apps/api/tests/viewsets/test_abstract_viewset.py @@ -47,7 +47,7 @@ from onadata.apps.main.models import MetaData, UserProfile from onadata.apps.main.tests.test_base import TestBase from onadata.apps.viewer.models import DataDictionary -from onadata.apps.viewer.models.data_dictionary import create_export_repeat_register +from onadata.apps.viewer.models.data_dictionary import create_or_update_export_register from onadata.libs.serializers.project_serializer import ProjectSerializer from onadata.libs.utils.common_tools import merge_dicts @@ -139,15 +139,15 @@ def setUp(self): self.maxDiff = None # Disable signals post_save.disconnect( - sender=DataDictionary, dispatch_uid="create_export_repeat_register" + sender=DataDictionary, dispatch_uid="create_or_update_export_register" ) def tearDown(self): # Enable signals post_save.connect( sender=DataDictionary, - dispatch_uid="create_export_repeat_register", - receiver=create_export_repeat_register, + dispatch_uid="create_or_update_export_register", + receiver=create_or_update_export_register, ) TestCase.tearDown(self) diff --git a/onadata/apps/logger/models/instance.py b/onadata/apps/logger/models/instance.py index 786b01f84c..7c4b861f08 100644 --- a/onadata/apps/logger/models/instance.py +++ b/onadata/apps/logger/models/instance.py @@ -879,12 +879,12 @@ def permanently_delete_attachments(sender, instance=None, created=False, **kwarg @use_master -def register_export_repeats(sender, instance, created=False, **kwargs): +def register_instance_repeat_columns(sender, instance, created=False, **kwargs): # Avoid cyclic dependency errors logger_tasks = importlib.import_module("onadata.apps.logger.tasks") transaction.on_commit( - lambda: logger_tasks.register_instance_export_repeats_async.delay(instance.pk) + lambda: logger_tasks.register_instance_repeat_columns_async.delay(instance.pk) ) @@ -905,9 +905,9 @@ def register_export_repeats(sender, instance, created=False, **kwargs): ) post_save.connect( - register_export_repeats, + register_instance_repeat_columns, sender=Instance, - dispatch_uid="register_export_repeats", + dispatch_uid="register_instance_repeat_columns", ) diff --git a/onadata/apps/logger/models/xform.py b/onadata/apps/logger/models/xform.py index 1b0a14e3b3..059dc31911 100644 --- a/onadata/apps/logger/models/xform.py +++ b/onadata/apps/logger/models/xform.py @@ -29,7 +29,6 @@ from pyxform import SurveyElementBuilder, constants, create_survey_element_from_dict from pyxform.question import Question from pyxform.section import RepeatingSection -from pyxform.xform2json import create_survey_element_from_xml from six import iteritems from taggit.managers import TaggableManager @@ -374,18 +373,22 @@ def get_unique_id_string(self, id_string, count=0): return id_string + def _get_survey(self): + try: + builder = SurveyElementBuilder() + if isinstance(self.json, str): + return builder.create_survey_element_from_json(self.json) + if isinstance(self.json, dict): + return builder.create_survey_element_from_dict(self.json) + except ValueError: + pass + + return bytes(bytearray(self.xml, encoding="utf-8")) + def get_survey(self): """Returns an XML XForm survey object.""" if not hasattr(self, "_survey"): - try: - builder = SurveyElementBuilder() - if isinstance(self.json, str): - self._survey = builder.create_survey_element_from_json(self.json) - if isinstance(self.json, dict): - self._survey = builder.create_survey_element_from_dict(self.json) - except ValueError: - xml = bytes(bytearray(self.xml, encoding="utf-8")) - self._survey = create_survey_element_from_xml(xml) + self._survey = self._get_survey() return self._survey survey = property(get_survey) diff --git a/onadata/apps/logger/tasks.py b/onadata/apps/logger/tasks.py index 233ebc6831..ba80540e9f 100644 --- a/onadata/apps/logger/tasks.py +++ b/onadata/apps/logger/tasks.py @@ -17,8 +17,8 @@ commit_cached_elist_num_entities, dec_elist_num_entities, inc_elist_num_entities, - register_instance_export_repeats, - register_xform_export_repeats, + reconstruct_xform_export_register, + register_instance_repeat_columns, soft_delete_entities_bulk, ) from onadata.libs.utils.project_utils import set_project_perms_to_object @@ -116,8 +116,8 @@ def dec_elist_num_entities_async(elist_pk: int) -> None: @app.task(retry_backoff=3, autoretry_for=(DatabaseError, ConnectionError)) -def register_instance_export_repeats_async(instance_pk: int) -> None: - """Register export repeats asynchronously +def register_instance_repeat_columns_async(instance_pk: int) -> None: + """Register Instance repeat columns asynchronously :param instance_pk: Primary key for Instance """ @@ -128,12 +128,12 @@ def register_instance_export_repeats_async(instance_pk: int) -> None: logger.exception(exc) else: - register_instance_export_repeats(instance) + register_instance_repeat_columns(instance) @app.task(retry_backoff=3, autoretry_for=(DatabaseError, ConnectionError)) -def register_xform_export_repeats_async(xform_id: int) -> None: - """Register export repeats for an XForm asynchronously +def reconstruct_xform_export_register_async(xform_id: int) -> None: + """Register a XForm's Instances export columns asynchronously :param xform_id: Primary key for XForm """ @@ -144,4 +144,4 @@ def register_xform_export_repeats_async(xform_id: int) -> None: logger.exception(exc) else: - register_xform_export_repeats(xform) + reconstruct_xform_export_register(xform) diff --git a/onadata/apps/logger/tests/models/test_instance.py b/onadata/apps/logger/tests/models/test_instance.py index 7fafbf61e5..1b0db9ea4e 100644 --- a/onadata/apps/logger/tests/models/test_instance.py +++ b/onadata/apps/logger/tests/models/test_instance.py @@ -3,10 +3,13 @@ Test Instance model. """ +import json import os +from collections import OrderedDict from datetime import datetime, timedelta from unittest.mock import Mock, patch +from django.contrib.contenttypes.models import ContentType from django.http.request import HttpRequest from django.test import override_settings from django.utils.timezone import utc @@ -1211,8 +1214,8 @@ def test_xml_entity_node_missing(self): self.assertEqual(Entity.objects.count(), 0) - def test_register_repeats(self): - """Repeats are registered correctly""" + def test_repeat_columns_registered(self): + """Instance repeat columns are added to export columns register""" project = get_user_default_project(self.user) md = """ | survey | @@ -1229,6 +1232,34 @@ def test_register_repeats(self): | | Births | births | | """ xform = self._publish_markdown(md, self.user, project) + register = MetaData.objects.get( + data_type="export_columns_register", + object_id=xform.pk, + content_type=ContentType.objects.get_for_model(xform), + ) + # Default export columns are correctly registered + merged_multiples_columns = json.loads( + register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict + ) + split_multiples_columns = json.loads( + register.extra_data["split_multiples"], object_pairs_hook=OrderedDict + ) + expected_columns = OrderedDict( + [ + ( + "hospital_repeat", + [], + ), + ( + "hospital_repeat/child_repeat", + [], + ), + ("meta/instanceID", None), + ] + ) + self.assertEqual(merged_multiples_columns, expected_columns) + self.assertEqual(split_multiples_columns, expected_columns) + xml = ( '' '' @@ -1302,6 +1361,39 @@ def test_register_repeats(self): instance.xml = xml instance.uuid = "51cb9e07-cfc7-413b-bc22-ee7adfa9dec4" instance.save() - metadata.refresh_from_db() - self.assertEqual(metadata.extra_data.get("hospital_repeat"), 3) - self.assertEqual(metadata.extra_data.get("child_repeat"), 2) + register.refresh_from_db() + merged_multiples_columns = json.loads( + register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict + ) + split_multiples_columns = json.loads( + register.extra_data["split_multiples"], object_pairs_hook=OrderedDict + ) + expected_columns = OrderedDict( + [ + ( + "hospital_repeat", + [ + "hospital_repeat[1]/hospital", + "hospital_repeat[2]/hospital", + "hospital_repeat[3]/hospital", + ], + ), + ( + "hospital_repeat/child_repeat", + [ + "hospital_repeat[1]/child_repeat[1]/name", + "hospital_repeat[1]/child_repeat[1]/birthweight", + "hospital_repeat[1]/child_repeat[2]/name", + "hospital_repeat[1]/child_repeat[2]/birthweight", + "hospital_repeat[2]/child_repeat[1]/name", + "hospital_repeat[2]/child_repeat[1]/birthweight", + "hospital_repeat[3]/child_repeat[1]/name", + "hospital_repeat[3]/child_repeat[1]/birthweight", + ], + ), + ("meta/instanceID", None), + ] + ) + + self.assertEqual(merged_multiples_columns, expected_columns) + self.assertEqual(split_multiples_columns, expected_columns) diff --git a/onadata/apps/logger/tests/test_tasks.py b/onadata/apps/logger/tests/test_tasks.py index e4c249fffd..2c1ba88612 100644 --- a/onadata/apps/logger/tests/test_tasks.py +++ b/onadata/apps/logger/tests/test_tasks.py @@ -13,8 +13,8 @@ from onadata.apps.logger.tasks import ( apply_project_date_modified_async, commit_cached_elist_num_entities_async, - register_instance_export_repeats_async, - register_xform_export_repeats_async, + reconstruct_xform_export_register_async, + register_instance_repeat_columns_async, set_entity_list_perms_async, ) from onadata.apps.main.tests.test_base import TestBase @@ -157,9 +157,9 @@ def test_retry_database_error(self, mock_retry, mock_set_perms): self.assertTrue(isinstance(kwargs["exc"], DatabaseError)) -@patch("onadata.apps.logger.tasks.register_instance_export_repeats") -class RegisterInstanceExportRepeatsAsyncTestCase(TestBase): - """Tests for register_instance_export_repeats_async""" +@patch("onadata.apps.logger.tasks.register_instance_repeat_columns") +class RegisterInstanceRepeatColumnsAsyncTestCase(TestBase): + """Tests for register_instance_repeat_columns_async""" def setUp(self): super().setUp() @@ -168,29 +168,29 @@ def setUp(self): self._submit_transport_instance() self.instance = self.xform.instances.first() - def test_register_repeats(self, mock_register): - """Repeats are registered""" - register_instance_export_repeats_async.delay(self.instance.pk) + def test_register_columns(self, mock_register): + """Columns are registered""" + register_instance_repeat_columns_async.delay(self.instance.pk) mock_register.assert_called_once_with(self.instance) - @patch("onadata.apps.logger.tasks.register_instance_export_repeats_async.retry") + @patch("onadata.apps.logger.tasks.register_instance_repeat_columns_async.retry") def test_retry_connection_error(self, mock_retry, mock_register): """ConnectionError exception is retried""" mock_retry.side_effect = Retry mock_register.side_effect = ConnectionError - register_instance_export_repeats_async.delay(self.instance.pk) + register_instance_repeat_columns_async.delay(self.instance.pk) self.assertTrue(mock_retry.called) _, kwargs = mock_retry.call_args_list[0] self.assertTrue(isinstance(kwargs["exc"], ConnectionError)) - @patch("onadata.apps.logger.tasks.register_instance_export_repeats_async.retry") + @patch("onadata.apps.logger.tasks.register_instance_repeat_columns_async.retry") def test_retry_database_error(self, mock_retry, mock_register): """DatabaseError exception is retried""" mock_retry.side_effect = Retry mock_register.side_effect = DatabaseError - register_instance_export_repeats_async.delay(self.instance.pk) + register_instance_repeat_columns_async.delay(self.instance.pk) self.assertTrue(mock_retry.called) @@ -200,14 +200,14 @@ def test_retry_database_error(self, mock_retry, mock_register): @patch("onadata.apps.logger.tasks.logger.exception") def test_invalid_pk(self, mock_logger, mock_register): """Invalid Instance primary key is handled""" - register_instance_export_repeats_async.delay(sys.maxsize) + register_instance_repeat_columns_async.delay(sys.maxsize) mock_register.assert_not_called() mock_logger.assert_called_once() -@patch("onadata.apps.logger.tasks.register_xform_export_repeats") -class RegisterXFormExportRepeatsAsyncTestCase(TestBase): - """Tests for register_xform_export_repeats_async""" +@patch("onadata.apps.logger.tasks.reconstruct_xform_export_register") +class ReconstructXFormExportRegisterAsyncTestCase(TestBase): + """Tests for register_xform_export_register_async""" def setUp(self): super().setUp() @@ -215,29 +215,29 @@ def setUp(self): self._publish_transportation_form() self._submit_transport_instance() - def test_register_repeats(self, mock_register): - """Repeats are registered""" - register_xform_export_repeats_async.delay(self.xform.pk) + def test_register_columns(self, mock_register): + """Columns are registered""" + reconstruct_xform_export_register_async.delay(self.xform.pk) mock_register.assert_called_once_with(self.xform) - @patch("onadata.apps.logger.tasks.register_xform_export_repeats_async.retry") + @patch("onadata.apps.logger.tasks.reconstruct_xform_export_register_async.retry") def test_retry_connection_error(self, mock_retry, mock_register): """ConnectionError exception is retried""" mock_retry.side_effect = Retry mock_register.side_effect = ConnectionError - register_xform_export_repeats_async.delay(self.xform.pk) + reconstruct_xform_export_register_async.delay(self.xform.pk) self.assertTrue(mock_retry.called) _, kwargs = mock_retry.call_args_list[0] self.assertTrue(isinstance(kwargs["exc"], ConnectionError)) - @patch("onadata.apps.logger.tasks.register_xform_export_repeats_async.retry") + @patch("onadata.apps.logger.tasks.reconstruct_xform_export_register_async.retry") def test_retry_database_error(self, mock_retry, mock_register): """DatabaseError exception is retried""" mock_retry.side_effect = Retry mock_register.side_effect = DatabaseError - register_xform_export_repeats_async.delay(self.xform.pk) + reconstruct_xform_export_register_async.delay(self.xform.pk) self.assertTrue(mock_retry.called) @@ -247,6 +247,6 @@ def test_retry_database_error(self, mock_retry, mock_register): @patch("onadata.apps.logger.tasks.logger.exception") def test_invalid_pk(self, mock_logger, mock_register): """Invalid XForm primary key is handled""" - register_xform_export_repeats_async.delay(sys.maxsize) + reconstruct_xform_export_register_async.delay(sys.maxsize) mock_register.assert_not_called() mock_logger.assert_called_once() diff --git a/onadata/apps/main/models/meta_data.py b/onadata/apps/main/models/meta_data.py index b8f0f61cbf..e892147970 100644 --- a/onadata/apps/main/models/meta_data.py +++ b/onadata/apps/main/models/meta_data.py @@ -6,9 +6,12 @@ from __future__ import unicode_literals import hashlib +import importlib +import json import logging import mimetypes import os +from collections import OrderedDict from contextlib import closing from django.conf import settings @@ -30,6 +33,7 @@ safe_delete, ) from onadata.libs.utils.common_tags import ( + EXPORT_COLUMNS_REGISTER, GOOGLE_SHEET_DATA_TYPE, TEXTIT, TEXTIT_DETAILS, @@ -96,13 +100,22 @@ def get_default_content_type(): return content_object.id -def unique_type_for_form(content_object, data_type, data_value=None, data_file=None): +def unique_type_for_form( + content_object, data_type, data_value=None, data_file=None, extra_data=None +): """ Ensure that each metadata object has unique xform and data_type fields return the metadata object """ - defaults = {"data_value": data_value} if data_value else {} + defaults = {} + + if data_value: + defaults["data_value"] = data_value + + if extra_data: + defaults["extra_data"] = extra_data + content_type = ContentType.objects.get_for_model(content_object) if data_value is None and data_file is None: @@ -111,7 +124,7 @@ def unique_type_for_form(content_object, data_type, data_value=None, data_file=N object_id=content_object.id, content_type=content_type, data_type=data_type ).first() else: - result, _created = MetaData.objects.update_or_create( + result, _ = MetaData.objects.update_or_create( object_id=content_object.id, content_type=content_type, data_type=data_type, @@ -570,6 +583,27 @@ def instance_csv_imported_by(content_object, data_value=None): data_type = "imported_via_csv_by" return unique_type_for_form(content_object, data_type, data_value) + @staticmethod + def update_or_create_export_register(content_object, data_value=None): + """Update or create export columns register for XForm.""" + # Avoid cyclic import by using importlib + csv_builder = importlib.import_module("onadata.libs.utils.csv_builder") + ordered_columns = OrderedDict() + # pylint: disable=protected-access + csv_builder.CSVDataFrameBuilder._build_ordered_columns( + content_object._get_survey(), ordered_columns + ) + serialized_columns = json.dumps(ordered_columns) + data_type = EXPORT_COLUMNS_REGISTER + extra_data = { + "merged_multiples": serialized_columns, + "split_multiples": serialized_columns, + } + data_value = "" if data_value is None else data_value + return unique_type_for_form( + content_object, data_type, data_value=data_value, extra_data=extra_data + ) + # pylint: disable=unused-argument,invalid-name def clear_cached_metadata_instance_object( diff --git a/onadata/apps/main/tests/test_csv_export.py b/onadata/apps/main/tests/test_csv_export.py index aca24b173f..93a29d0bd7 100644 --- a/onadata/apps/main/tests/test_csv_export.py +++ b/onadata/apps/main/tests/test_csv_export.py @@ -13,7 +13,7 @@ from onadata.apps.logger.models import XForm from onadata.apps.main.tests.test_base import TestBase from onadata.apps.viewer.models import DataDictionary, Export -from onadata.apps.viewer.models.data_dictionary import create_export_repeat_register +from onadata.apps.viewer.models.data_dictionary import create_or_update_export_register from onadata.libs.utils.export_tools import generate_export @@ -31,15 +31,15 @@ def setUp(self): self.xform = None # Disable signals post_save.disconnect( - sender=DataDictionary, dispatch_uid="create_export_repeat_register" + sender=DataDictionary, dispatch_uid="create_or_update_export_register" ) def tearDown(self): # Reconnect signals post_save.connect( sender=DataDictionary, - dispatch_uid="create_export_repeat_register", - receiver=create_export_repeat_register, + dispatch_uid="create_or_update_export_register", + receiver=create_or_update_export_register, ) super().tearDown() diff --git a/onadata/apps/main/tests/test_form_metadata.py b/onadata/apps/main/tests/test_form_metadata.py index b9134ea74f..959dd90699 100644 --- a/onadata/apps/main/tests/test_form_metadata.py +++ b/onadata/apps/main/tests/test_form_metadata.py @@ -67,7 +67,7 @@ def _add_metadata(self, data_type="doc"): else: self.doc = ( MetaData.objects.all() - .exclude(data_type="export_repeat_register") + .exclude(data_type="export_columns_register") .reverse()[0] ) self.doc_url = reverse( diff --git a/onadata/apps/viewer/models/data_dictionary.py b/onadata/apps/viewer/models/data_dictionary.py index 2d3845dfba..ea1c845b71 100644 --- a/onadata/apps/viewer/models/data_dictionary.py +++ b/onadata/apps/viewer/models/data_dictionary.py @@ -35,7 +35,6 @@ PROJ_FORMS_CACHE, safe_delete, ) -from onadata.libs.utils.common_tags import EXPORT_REPEAT_REGISTER from onadata.libs.utils.model_tools import get_columns_with_hxl, set_uuid @@ -441,19 +440,19 @@ def invalidate_caches(sender, instance=None, created=False, **kwargs): ) -def create_export_repeat_register(sender, instance=None, created=False, **kwargs): - """Create export repeat register for the form""" - if created: - MetaData.objects.create( - content_type=ContentType.objects.get_for_model(instance), - object_id=instance.pk, - data_type=EXPORT_REPEAT_REGISTER, - data_value="", - ) +def create_or_update_export_register(sender, instance=None, created=False, **kwargs): + """Create or update export columns register for the form""" + # Avoid cyclic import by using importlib + logger_tasks = importlib.import_module("onadata.apps.logger.tasks") + + MetaData.update_or_create_export_register(instance) + + if not created: + logger_tasks.reconstruct_xform_export_register_async.delay(instance.pk) post_save.connect( - create_export_repeat_register, + create_or_update_export_register, sender=DataDictionary, - dispatch_uid="create_export_repeat_register", + dispatch_uid="create_or_update_export_register", ) diff --git a/onadata/apps/viewer/models/tests/test_data_dictionary.py b/onadata/apps/viewer/models/tests/test_data_dictionary.py index 3be2b032b1..d7a9b1d9c8 100644 --- a/onadata/apps/viewer/models/tests/test_data_dictionary.py +++ b/onadata/apps/viewer/models/tests/test_data_dictionary.py @@ -1,6 +1,8 @@ """Tests for onadata.apps.viewer.models.data_dictionary""" import json +from collections import OrderedDict +from unittest.mock import patch from django.contrib.contenttypes.models import ContentType from django.core.cache import cache @@ -309,14 +311,107 @@ def test_cache_invalidated(self): for key in cache_keys: self.assertIsNone(cache.get(key)) - def test_export_repeat_register_created(self): - """Export repeat register is created when form is published""" - xform = self._publish_markdown(self.registration_form, self.user) + def test_export_columns_register_created(self): + """Export columns register is created when form is published""" + md = """ + | survey | + | | type | name | label | + | | text | name | First Name | + | settings| | | | + | | form_title | form_id | | + | | Students | students | | + """ + xform = self._publish_markdown(md, self.user) content_type = ContentType.objects.get_for_model(xform) exists = MetaData.objects.filter( - data_type="export_repeat_register", + data_type="export_columns_register", object_id=xform.pk, content_type=content_type, ).exists() self.assertTrue(exists) + + register = MetaData.objects.get( + data_type="export_columns_register", + object_id=xform.pk, + content_type=content_type, + ) + merged_multiples_columns = json.loads( + register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict + ) + split_multiples_columns = json.loads( + register.extra_data["split_multiples"], object_pairs_hook=OrderedDict + ) + expected_columns = OrderedDict( + [ + ("name", None), + ("meta/instanceID", None), + ] + ) + + self.assertEqual(merged_multiples_columns, expected_columns) + self.assertEqual(split_multiples_columns, expected_columns) + + @patch("onadata.apps.logger.tasks.reconstruct_xform_export_register_async.delay") + def test_export_columns_register_updated(self, mock_register_xform_columns): + """Export columns register is updated when form is replaced""" + md = """ + | survey | + | | type | name | label | + | | text | name | First Name | + | settings| | | | + | | form_title | form_id | | + | | Students | students | | + """ + xform = self._publish_markdown(md, self.user) + content_type = ContentType.objects.get_for_model(xform) + register = MetaData.objects.get( + data_type="export_columns_register", + object_id=xform.pk, + content_type=content_type, + ) + merged_multiples_columns = json.loads( + register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict + ) + split_multiples_columns = json.loads( + register.extra_data["split_multiples"], object_pairs_hook=OrderedDict + ) + expected_columns = OrderedDict( + [ + ("name", None), + ("meta/instanceID", None), + ] + ) + + self.assertEqual(merged_multiples_columns, expected_columns) + self.assertEqual(split_multiples_columns, expected_columns) + # Replace form + md = """ + | survey | + | | type | name | label | + | | text | name | First Name | + | | text | age | Age | + | settings| | | | + | | form_title | form_id | | + | | Students | students | | + """ + self._replace_form(md, xform) + register.refresh_from_db() + merged_multiples_columns = json.loads( + register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict + ) + split_multiples_columns = json.loads( + register.extra_data["split_multiples"], object_pairs_hook=OrderedDict + ) + expected_columns = OrderedDict( + [ + ("name", None), + ("age", None), + ("meta/instanceID", None), + ] + ) + + self.assertEqual(merged_multiples_columns, expected_columns) + self.assertEqual(split_multiples_columns, expected_columns) + # Task is called to add columns for repeat data + mock_register_xform_columns.assert_called_once_with(xform.pk) diff --git a/onadata/libs/tests/utils/test_csv_builder.py b/onadata/libs/tests/utils/test_csv_builder.py index abd1cb5b00..f5fca15f08 100644 --- a/onadata/libs/tests/utils/test_csv_builder.py +++ b/onadata/libs/tests/utils/test_csv_builder.py @@ -4,8 +4,10 @@ """ import csv +import json import os from builtins import chr, open +from collections import OrderedDict from tempfile import NamedTemporaryFile from unittest.mock import patch @@ -21,7 +23,7 @@ from onadata.apps.main.models.meta_data import MetaData from onadata.apps.main.tests.test_base import TestBase from onadata.apps.viewer.models import DataDictionary -from onadata.apps.viewer.models.data_dictionary import create_export_repeat_register +from onadata.apps.viewer.models.data_dictionary import create_or_update_export_register from onadata.libs.utils.common_tags import NA_REP from onadata.libs.utils.csv_builder import ( AbstractDataFrameBuilder, @@ -68,16 +70,24 @@ def setUp(self): self._submission_time = parse_datetime("2013-02-18 15:54:01Z") # Disable signals post_save.disconnect( - sender=DataDictionary, dispatch_uid="create_export_repeat_register" + sender=DataDictionary, dispatch_uid="create_or_update_export_register" ) + # Patch and start the mock + self.patcher = patch( + "onadata.libs.utils.csv_builder.reconstruct_xform_export_register_async.delay", + autospec=True, + ) + self.mock_register = self.patcher.start() def tearDown(self): # Enable signals post_save.connect( sender=DataDictionary, - dispatch_uid="create_export_repeat_register", - receiver=create_export_repeat_register, + dispatch_uid="create_or_update_export_register", + receiver=create_or_update_export_register, ) + # Stop the mock + self.patcher.stop() super().tearDown() @@ -120,6 +130,83 @@ def _publish_grouped_gps_form(self): # pylint: disable=attribute-defined-outside-init self.survey_name = "grouped_gps" + def _publish_select_multiples_grouped_repeating(self): + md = """ + | survey | + | | type | name | label | + | | text | name | Name | + | | integer | age | Age | + | | begin repeat | browser_use | Browser Use | + | | integer | year | Year | + | | select_multiple browsers | browsers | Browsers | + | | end repeat | | | + + | choices | + | | list name | name | label | + | | browsers | firefox | Firefox | + | | browsers | chrome | Chrome | + | | browsers | ie | Internet Explorer | + | | browsers | safari | Safari | + """ + xform = self._publish_markdown(md, self.user, id_string="browser_use") + return xform + + def _register_select_multiples_grouped_repeating(self, xform): + MetaData.objects.create( + content_type=ContentType.objects.get_for_model(xform), + object_id=xform.pk, + data_type="export_columns_register", + data_value="", + extra_data={ + "merged_multiples": json.dumps( + OrderedDict( + [ + ("name", None), + ("age", None), + ( + "browser_use", + ["browser_use[1]/year", "browser_use[1]/browsers"], + ), + ("meta/instanceID", None), + ] + ) + ), + "split_multiples": json.dumps( + OrderedDict( + [ + ("name", None), + ("age", None), + ( + "browser_use", + [ + "browser_use[1]/year", + "browser_use[1]/browsers/firefox", + "browser_use[1]/browsers/chrome", + "browser_use[1]/browsers/ie", + "browser_use[1]/browsers/safari", + ], + ), + ("meta/instanceID", None), + ] + ) + ), + }, + ) + cursor = [ + { + "name": "Tom", + "age": 23, + "browser_use": [ + { + "browser_use/year": "2010", + "browser_use/browsers": "firefox safari", + }, + ], + } + ] + + return cursor + def _csv_data_for_dataframe(self): csv_df_builder = CSVDataFrameBuilder( self.user.username, self.xform.id_string, include_images=False @@ -129,8 +216,7 @@ def _csv_data_for_dataframe(self): ) return [d for d in csv_df_builder._format_for_dataframe(cursor)] - @patch("onadata.libs.utils.csv_builder.register_xform_export_repeats_async.delay") - def test_csv_dataframe_export_to(self, mock_register_repeats): + def test_csv_dataframe_export_to(self): """ Test CSVDataFrameBuilder.export_to(). """ @@ -160,8 +246,6 @@ def test_csv_dataframe_export_to(self, mock_register_repeats): with open(temp_file.name) as csv_file: self._test_csv_files(csv_file, csv_fixture_path) os.unlink(temp_file.name) - # Repeat register is created for future use - mock_register_repeats.assert_called() # pylint: disable=invalid-name def test_csv_columns_for_gps_within_groups(self): @@ -2167,69 +2251,10 @@ def test_extra_columns_dataview(self): header = next(csv_reader) self.assertEqual(header, ["age", extra_col]) - def test_registered_repeats(self): - """Registered repeats are used to generate export""" - md_xform = """ - | survey | - | | type | name | label | - | | begin repeat | hospital_repeat | | - | | text | hospital | Hospital Name | - | | begin repeat | child_repeat | | - | | text | name | Child Name | - | | decimal | birthweight | Birth Weight | - | | select_one male_female | sex | Child sex | - | | end repeat | | | - | | end repeat | | | - | choices | | | | - | | list name | name | label | - | | male_female | male | Male | - | | male_female | female | Female | - """ - self._publish_markdown(md_xform, self.user, id_string="nested_repeats") - xform = XForm.objects.get(user=self.user, id_string="nested_repeats") - cursor = [ - { - "hospital_repeat": [ - { - "hospital_repeat/hospital": "Aga Khan", - "hospital_repeat/child_repeat": [ - { - "hospital_repeat/child_repeat/sex": "male", - "hospital_repeat/child_repeat/name": "Zakayo", - "hospital_repeat/child_repeat/birthweight": 3.3, - }, - { - "hospital_repeat/child_repeat/sex": "female", - "hospital_repeat/child_repeat/name": "Melania", - "hospital_repeat/child_repeat/birthweight": 3.5, - }, - ], - }, - { - "hospital_repeat/hospital": "Mama Lucy", - "hospital_repeat/child_repeat": [ - { - "hospital_repeat/child_repeat/sex": "female", - "hospital_repeat/child_repeat/name": "Winnie", - "hospital_repeat/child_repeat/birthweight": 3.1, - } - ], - }, - ], - } - ] - content_type = ContentType.objects.get_for_model(xform) - # Simulate registered repeats (Repeats are normally registered from incoming submissions) - MetaData.objects.create( - content_type=content_type, - object_id=xform.pk, - data_type="export_repeat_register", - data_value="", - extra_data={ - "hospital_repeat": 2, - "child_repeat": 2, - }, - ) + def test_export_register_split_multiples(self): + """Export register works with split multiples""" + xform = self._publish_select_multiples_grouped_repeating() + cursor = self._register_select_multiples_grouped_repeating(xform) builder = CSVDataFrameBuilder( self.user.username, xform.id_string, @@ -2241,20 +2266,13 @@ def test_registered_repeats(self): csv_reader = csv.reader(csv_file) header = next(csv_reader) expected_header = [ - "hospital_repeat[1]/hospital", - "hospital_repeat[2]/hospital", - "hospital_repeat[1]/child_repeat[1]/name", - "hospital_repeat[1]/child_repeat[1]/birthweight", - "hospital_repeat[1]/child_repeat[1]/sex", - "hospital_repeat[1]/child_repeat[2]/name", - "hospital_repeat[1]/child_repeat[2]/birthweight", - "hospital_repeat[1]/child_repeat[2]/sex", - "hospital_repeat[2]/child_repeat[1]/name", - "hospital_repeat[2]/child_repeat[1]/birthweight", - "hospital_repeat[2]/child_repeat[1]/sex", - "hospital_repeat[2]/child_repeat[2]/name", - "hospital_repeat[2]/child_repeat[2]/birthweight", - "hospital_repeat[2]/child_repeat[2]/sex", + "name", + "age", + "browser_use[1]/year", + "browser_use[1]/browsers/firefox", + "browser_use[1]/browsers/chrome", + "browser_use[1]/browsers/ie", + "browser_use[1]/browsers/safari", "meta/instanceID", "_id", "_uuid", @@ -2272,17 +2290,17 @@ def test_registered_repeats(self): self.assertCountEqual(header, expected_header) row = next(csv_reader) expected_row = [ - "Aga Khan", - "Mama Lucy", - "Zakayo", - "3.3", - "male", - "Melania", - "3.5", - "female", - "Winnie", - "3.1", - "female", + "Tom", + "23", + "2010", + "True", + "False", + "False", + "True", + "n/a", + "n/a", + "n/a", + "n/a", "n/a", "n/a", "n/a", @@ -2292,6 +2310,57 @@ def test_registered_repeats(self): "n/a", "n/a", "n/a", + ] + self.assertEqual(row, expected_row) + csv_file.close() + + def test_export_register_merged_multiples(self): + """Export register works with merged multiples""" + xform = self._publish_select_multiples_grouped_repeating() + cursor = self._register_select_multiples_grouped_repeating(xform) + builder = CSVDataFrameBuilder( + self.user.username, + xform.id_string, + include_images=False, + split_select_multiples=False, + ) + temp_file = NamedTemporaryFile(suffix=".csv", delete=False) + builder.export_to(temp_file.name, cursor) + csv_file = open(temp_file.name, "r") + csv_reader = csv.reader(csv_file) + header = next(csv_reader) + expected_header = [ + "name", + "age", + "browser_use[1]/year", + "browser_use[1]/browsers", + "meta/instanceID", + "_id", + "_uuid", + "_submission_time", + "_date_modified", + "_tags", + "_notes", + "_version", + "_duration", + "_submitted_by", + "_total_media", + "_media_count", + "_media_all_received", + ] + self.assertCountEqual(header, expected_header) + row = next(csv_reader) + expected_row = [ + "Tom", + "23", + "2010", + "firefox safari", + "n/a", + "n/a", + "n/a", + "n/a", + "n/a", + "n/a", "n/a", "n/a", "n/a", @@ -2300,11 +2369,11 @@ def test_registered_repeats(self): "n/a", "n/a", ] - self.assertEqual(row, expected_row) + self.assertCountEqual(row, expected_row) csv_file.close() - def test_repeat_not_found_in_register(self): - """Repeat not found in register""" + def test_export_columns_register_missing(self): + """Export columns register not found""" md_xform = """ | survey | | | type | name | label | @@ -2355,16 +2424,14 @@ def test_repeat_not_found_in_register(self): } ] content_type = ContentType.objects.get_for_model(xform) - # Simulate registered repeats (Repeats are normally registered from incoming submissions) - metadata = MetaData.objects.create( + exists = MetaData.objects.filter( content_type=content_type, object_id=xform.pk, - data_type="export_repeat_register", - data_value="", - extra_data={ - "hospital_repeat": 2, # nested child_repeat not registered - }, - ) + data_type="export_columns_register", + ).exists() + # Confirm register is not found + self.assertFalse(exists) + builder = CSVDataFrameBuilder( self.user.username, xform.id_string, @@ -2378,6 +2445,15 @@ def test_repeat_not_found_in_register(self): expected_header = [ "hospital_repeat[1]/hospital", "hospital_repeat[2]/hospital", + "hospital_repeat[1]/child_repeat[1]/name", + "hospital_repeat[1]/child_repeat[1]/birthweight", + "hospital_repeat[1]/child_repeat[1]/sex", + "hospital_repeat[1]/child_repeat[2]/name", + "hospital_repeat[1]/child_repeat[2]/birthweight", + "hospital_repeat[1]/child_repeat[2]/sex", + "hospital_repeat[2]/child_repeat[1]/name", + "hospital_repeat[2]/child_repeat[1]/birthweight", + "hospital_repeat[2]/child_repeat[1]/sex", "meta/instanceID", "_id", "_uuid", @@ -2393,18 +2469,34 @@ def test_repeat_not_found_in_register(self): "_media_all_received", ] self.assertCountEqual(header, expected_header) - # Parent repeat not registered - metadata.extra_data = {"child_repeat": 2} - metadata.save - builder = CSVDataFrameBuilder( - self.user.username, - xform.id_string, - include_images=False, - ) - temp_file = NamedTemporaryFile(suffix=".csv", delete=False) - builder.export_to(temp_file.name, cursor) - csv_file = open(temp_file.name, "r") - csv_reader = csv.reader(csv_file) - header = next(csv_reader) - self.assertCountEqual(header, expected_header) + row = next(csv_reader) + expected_row = [ + "Aga Khan", + "Mama Lucy", + "Zakayo", + "3.3", + "male", + "Melania", + "3.5", + "female", + "Winnie", + "3.1", + "female", + "n/a", + "n/a", + "n/a", + "n/a", + "n/a", + "n/a", + "n/a", + "n/a", + "n/a", + "n/a", + "n/a", + "n/a", + "n/a", + ] + self.assertEqual(row, expected_row) csv_file.close() + # Columns registered for future use + self.mock_register.assert_called_once_with(xform.pk) diff --git a/onadata/libs/tests/utils/test_logger_tools.py b/onadata/libs/tests/utils/test_logger_tools.py index ba25a95838..12bdc1586f 100644 --- a/onadata/libs/tests/utils/test_logger_tools.py +++ b/onadata/libs/tests/utils/test_logger_tools.py @@ -3,8 +3,10 @@ Test logger_tools utility functions. """ +import json import os import re +from collections import OrderedDict from datetime import datetime, timedelta from io import BytesIO from unittest.mock import Mock, call, patch @@ -44,8 +46,8 @@ generate_content_disposition_header, get_first_record, inc_elist_num_entities, - register_instance_export_repeats, - register_xform_export_repeats, + reconstruct_xform_export_register, + register_instance_repeat_columns, safe_create_instance, ) from onadata.libs.utils.user_auth import get_user_default_project @@ -1159,14 +1161,16 @@ def test_cache_deleted(self): self.assertIsNone(cache.get(f"xfm-submissions-deleting-{self.xform.id}")) -class RegisterInstanceExportRepeatsTestCase(TestBase): - """Tests for method `register_instance_export_repeats`""" +class RegisterInstanceRepeatColumnsTestCase(TestBase): + """Tests for method `register_instance_repeat_columns`""" def setUp(self): super().setUp() # Disable signals - post_save.disconnect(sender=Instance, dispatch_uid="register_export_repeats") + post_save.disconnect( + sender=Instance, dispatch_uid="register_instance_repeat_columns" + ) self.project = get_user_default_project(self.user) md = """ @@ -1216,105 +1220,196 @@ def setUp(self): xml=self.xml, user=self.user, xform=self.xform ) self.register = MetaData.objects.get( - data_type="export_repeat_register", + data_type="export_columns_register", object_id=self.xform.pk, content_type=ContentType.objects.get_for_model(self.xform), ) - def test_repeat_register_not_found(self): - """Nothing happens if export repeat register is not found""" - self.register.delete() - register_instance_export_repeats(self.instance) - - exists = MetaData.objects.filter(data_type="export_repeat_register").exists() - self.assertFalse(exists) + def test_columns_added(self): + """Incoming columns are added to the register""" + merged_multiples = json.loads( + self.register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict + ) + split_multiples = json.loads( + self.register.extra_data["split_multiples"], object_pairs_hook=OrderedDict + ) + # Before Instance repeat columns are added + expected_columns = OrderedDict( + [ + ( + "hospital_repeat", + [], + ), + ( + "hospital_repeat/child_repeat", + [], + ), + ("meta/instanceID", None), + ] + ) + self.assertEqual(merged_multiples, expected_columns) + self.assertEqual(split_multiples, expected_columns) - def test_incoming_repeat_max_greater(self): - """Repeat count is incremented if incoming repeat count is greater""" - self.register.extra_data = { - "hospital_repeat": 1, - "child_repeat": 1, - } - self.register.save() - register_instance_export_repeats(self.instance) + register_instance_repeat_columns(self.instance) + # After Instance repeat columns are added self.register.refresh_from_db() - self.assertEqual(self.register.extra_data.get("hospital_repeat"), 2) - self.assertEqual(self.register.extra_data.get("child_repeat"), 2) - - def test_incoming_repeat_max_less(self): - """Repeat count is unchanged if incoming repeat count is less""" - self.register.extra_data = { - "hospital_repeat": 3, - "child_repeat": 3, - } - self.register.save() - register_instance_export_repeats(self.instance) + merged_multiples = json.loads( + self.register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict + ) + split_multiples = json.loads( + self.register.extra_data["split_multiples"], object_pairs_hook=OrderedDict + ) + expected_columns = OrderedDict( + [ + ( + "hospital_repeat", + ["hospital_repeat[1]/hospital", "hospital_repeat[2]/hospital"], + ), + ( + "hospital_repeat/child_repeat", + [ + "hospital_repeat[1]/child_repeat[1]/name", + "hospital_repeat[1]/child_repeat[1]/birthweight", + "hospital_repeat[1]/child_repeat[2]/name", + "hospital_repeat[1]/child_repeat[2]/birthweight", + "hospital_repeat[2]/child_repeat[1]/name", + "hospital_repeat[2]/child_repeat[1]/birthweight", + ], + ), + ("meta/instanceID", None), + ] + ) - self.register.refresh_from_db() - # repeat counts remain unchanged - self.assertEqual(self.register.extra_data.get("hospital_repeat"), 3) - self.assertEqual(self.register.extra_data.get("child_repeat"), 3) - - def test_incoming_repeat_max_equal(self): - """Repeat count is unchanged if incoming repeat count is equal""" - self.register.extra_data = { - "hospital_repeat": 2, - "child_repeat": 2, - } - self.register.save() - register_instance_export_repeats(self.instance) + self.assertEqual(merged_multiples, expected_columns) + self.assertEqual(split_multiples, expected_columns) - self.register.refresh_from_db() - # repeat counts remain unchanged - self.assertEqual(self.register.extra_data.get("hospital_repeat"), 2) - self.assertEqual(self.register.extra_data.get("child_repeat"), 2) + def test_register_not_found(self): + """Nothing happens if export columns register is not found""" + self.register.delete() + register_instance_repeat_columns(self.instance) - def test_no_repeats(self): - """No change in register if no repeats are found in the instance""" + exists = MetaData.objects.filter(data_type="export_columns_register").exists() + self.assertFalse(exists) + + def test_select_multiples(self): + """Columns for a form with select multiples are added""" md = """ | survey | - | | type | name | label | - | | text | hospital | Name of hospital | - | settings| | | | - | | form_title | form_id | | - | | Births | births | | + | | type | name | label | + | | text | name | Name | + | | integer | age | Age | + | | begin repeat | browser_use | Browser Use | + | | integer | year | Year | + | | select_multiple browsers | browsers | Browsers | + | | end repeat | | | + + | choices | + | | list name | name | label | + | | browsers | firefox | Firefox | + | | browsers | chrome | Chrome | + | | browsers | ie | Internet Explorer | + | | browsers | safari | Safari | """ xform = self._publish_markdown( - md, self.user, self.project, id_string="no-repeats" + md, self.user, self.project, id_string="browser_use" ) + register = MetaData.objects.get( + data_type="export_columns_register", + object_id=xform.pk, + content_type=ContentType.objects.get_for_model(xform), + ) + # Before Instance repeat columns are added + merged_multiples_columns = json.loads( + register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict + ) + split_multiples_columns = json.loads( + register.extra_data["split_multiples"], object_pairs_hook=OrderedDict + ) + expected_columns = OrderedDict( + [ + ("name", None), + ("age", None), + ("browser_use", []), + ("meta/instanceID", None), + ] + ) + + self.assertEqual(merged_multiples_columns, expected_columns) + self.assertEqual(split_multiples_columns, expected_columns) + xml = ( '' '' f"{xform.uuid}" - "Aga Khan" + "John Doe" + "25" + "" + "2021" + "firefox chrome" + "" "" - "uuid:45d27780-48fd-4035-8655-9332649385bd" + "uuid:cea7954a-60d5-4f40-b844-080733a74a34" "" "" ) instance = Instance.objects.create(xml=xml, user=self.user, xform=xform) - register = MetaData.objects.get( - content_type=ContentType.objects.get_for_model(xform), - object_id=self.xform.id, - data_type="export_repeat_register", - ) - register_instance_export_repeats(instance) + register_instance_repeat_columns(instance) + + # After Instance repeat columns are added register.refresh_from_db() + merged_multiples_columns = json.loads( + register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict + ) + split_multiples_columns = json.loads( + register.extra_data["split_multiples"], object_pairs_hook=OrderedDict + ) - self.assertEqual(register.extra_data, {}) + self.assertEqual( + split_multiples_columns, + OrderedDict( + [ + ("name", None), + ("age", None), + ( + "browser_use", + [ + "browser_use[1]/year", + "browser_use[1]/browsers/firefox", + "browser_use[1]/browsers/chrome", + "browser_use[1]/browsers/ie", + "browser_use[1]/browsers/safari", + ], + ), + ("meta/instanceID", None), + ] + ), + ) + self.assertEqual( + merged_multiples_columns, + OrderedDict( + [ + ("name", None), + ("age", None), + ("browser_use", ["browser_use[1]/year", "browser_use[1]/browsers"]), + ("meta/instanceID", None), + ] + ), + ) -class RegisterXFormExportRepeatsTestCase(TestBase): - """Tests for method `register_xform_export_repeats`""" +class ReconstructXFormExportRegisterTestCase(TestBase): + """Tests for method `reconstruct_xform_export_register`""" def setUp(self): super().setUp() # Disable signals - post_save.disconnect(sender=Instance, dispatch_uid="register_export_repeats") + post_save.disconnect( + sender=Instance, dispatch_uid="register_instance_repeat_columns" + ) self.project = get_user_default_project(self.user) md = """ @@ -1364,17 +1459,91 @@ def setUp(self): xml=xml, user=self.user, xform=self.xform ) self.register = MetaData.objects.get( - data_type="export_repeat_register", + data_type="export_columns_register", object_id=self.xform.pk, content_type=ContentType.objects.get_for_model(self.xform), ) + self.expected_columns = OrderedDict( + [ + ( + "hospital_repeat", + ["hospital_repeat[1]/hospital", "hospital_repeat[2]/hospital"], + ), + ( + "hospital_repeat/child_repeat", + [ + "hospital_repeat[1]/child_repeat[1]/name", + "hospital_repeat[1]/child_repeat[1]/birthweight", + "hospital_repeat[1]/child_repeat[2]/name", + "hospital_repeat[1]/child_repeat[2]/birthweight", + "hospital_repeat[2]/child_repeat[1]/name", + "hospital_repeat[2]/child_repeat[1]/birthweight", + ], + ), + ("meta/instanceID", None), + ] + ) def test_register(self): """Repeats from all instances are registered""" - self.assertEqual(self.register.extra_data, {}) + merged_multiples_columns = json.loads( + self.register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict + ) + split_multiples_columns = json.loads( + self.register.extra_data["split_multiples"], object_pairs_hook=OrderedDict + ) + # Before reconstructing export columns register + expected_columns = OrderedDict( + [ + ( + "hospital_repeat", + [], + ), + ( + "hospital_repeat/child_repeat", + [], + ), + ("meta/instanceID", None), + ] + ) - register_xform_export_repeats(self.xform) + self.assertEqual(merged_multiples_columns, expected_columns) + self.assertEqual(split_multiples_columns, expected_columns) + + reconstruct_xform_export_register(self.xform) + + # After reconstructing register + self.register.refresh_from_db() + merged_multiples_columns = json.loads( + self.register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict + ) + split_multiples_columns = json.loads( + self.register.extra_data["split_multiples"], object_pairs_hook=OrderedDict + ) + + self.assertEqual(merged_multiples_columns, self.expected_columns) + self.assertEqual(split_multiples_columns, self.expected_columns) + + def test_register_not_found(self): + """Register is created if not found""" + self.register.delete() + reconstruct_xform_export_register(self.xform) + + exists = MetaData.objects.filter(data_type="export_columns_register").exists() + + self.assertTrue(exists) + + register = MetaData.objects.get( + data_type="export_columns_register", + object_id=self.xform.pk, + content_type=ContentType.objects.get_for_model(self.xform), + ) + merged_multiples_columns = json.loads( + register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict + ) + split_multiples_columns = json.loads( + register.extra_data["split_multiples"], object_pairs_hook=OrderedDict + ) - metadata = MetaData.objects.get(data_type="export_repeat_register") - self.assertEqual(metadata.extra_data.get("hospital_repeat"), 2) - self.assertEqual(metadata.extra_data.get("child_repeat"), 2) + self.assertEqual(merged_multiples_columns, self.expected_columns) + self.assertEqual(split_multiples_columns, self.expected_columns) diff --git a/onadata/libs/utils/common_tags.py b/onadata/libs/utils/common_tags.py index 69ee019ba7..1dcb032155 100644 --- a/onadata/libs/utils/common_tags.py +++ b/onadata/libs/utils/common_tags.py @@ -211,4 +211,4 @@ XFORM_CREATION_EVENT = "XForm created" PROJECT_CREATION_EVENT = "Project created" USER_CREATION_EVENT = "User account created" -EXPORT_REPEAT_REGISTER = "export_repeat_register" +EXPORT_COLUMNS_REGISTER = "export_columns_register" diff --git a/onadata/libs/utils/csv_builder.py b/onadata/libs/utils/csv_builder.py index 8708169480..391ef4adb8 100644 --- a/onadata/libs/utils/csv_builder.py +++ b/onadata/libs/utils/csv_builder.py @@ -3,6 +3,7 @@ CSV export utility functions. """ +import json from collections import OrderedDict from itertools import chain, tee @@ -17,7 +18,7 @@ from onadata.apps.logger.models import EntityList, OsmData from onadata.apps.logger.models.xform import XForm, question_types_to_exclude -from onadata.apps.logger.tasks import register_xform_export_repeats_async +from onadata.apps.logger.tasks import reconstruct_xform_export_register_async from onadata.apps.main.models.meta_data import MetaData from onadata.apps.viewer.models.data_dictionary import DataDictionary from onadata.libs.utils.common_tags import ( @@ -27,7 +28,7 @@ DELETEDAT, DURATION, EDITED, - EXPORT_REPEAT_REGISTER, + EXPORT_COLUMNS_REGISTER, GEOLOCATION, ID, MEDIA_ALL_RECEIVED, @@ -747,13 +748,8 @@ def _build_ordered_columns( # generated when we reindex ordered_columns[child.get_abbreviated_xpath()] = None - def _update_ordered_columns_from_data(self, cursor): - """ - Populates `self.ordered_columns` object that is - used to generate export column headers for - forms that split select multiple and gps data. - """ - # add ordered columns for select multiples + def _add_ordered_columns_for_select_multiples(self): + """Add ordered columns for select multiples""" if self.split_select_multiples: for key, choices in iteritems(self.select_multiples): # HACK to ensure choices are NOT duplicated @@ -769,88 +765,31 @@ def _update_ordered_columns_from_data(self, cursor): ] ) - # add ordered columns for gps fields + def _add_ordered_columns_for_gps_fields(self): + """Add ordered columns for gps fields""" for key in self.gps_fields: gps_xpaths = self.data_dictionary.get_additional_geopoint_xpaths(key) self.ordered_columns[key] = [key] + gps_xpaths - # Add ordered columns for nested repeat data - content_type = ContentType.objects.get_for_model(self.xform) - - def build_columns_from_data(): - """Build repeat columns from data.""" - for record in cursor: - # re index column repeats - for key, value in iteritems(record): - self._reindex( - key, - value, - self.ordered_columns, - record, - self.data_dictionary, - include_images=self.image_xpaths, - split_select_multiples=self.split_select_multiples, - index_tags=self.index_tags, - show_choice_labels=self.show_choice_labels, - language=self.language, - host=self.host, - ) - - # Register repeat columns for future use - register_xform_export_repeats_async.delay(self.xform.pk) - - def build_columns_from_register(repeat_xpath, num_repeats, parent_prefix=None): - """Build repeat columns from register.""" - for index in range(1, num_repeats + 1): - child_elements = self.data_dictionary.get_child_elements( - repeat_xpath, self.split_select_multiples + def _add_ordered_columns_for_repeat_data(self, cursor): + """Add ordered columns for nested repeat data""" + for record in cursor: + # re index column repeats + for key, value in iteritems(record): + self._reindex( + key, + value, + self.ordered_columns, + record, + self.data_dictionary, + include_images=self.image_xpaths, + split_select_multiples=self.split_select_multiples, + index_tags=self.index_tags, + show_choice_labels=self.show_choice_labels, + language=self.language, + host=self.host, ) - if parent_prefix is None: - prefix = f"{repeat_xpath}[{index}]" - - else: - repeat_name = repeat_xpath.split("/")[-1] - prefix = f"{parent_prefix}/{repeat_name}[{index}]" - - for element in child_elements: - if not question_types_to_exclude(element.type): - if not isinstance(element, RepeatingSection): - self.ordered_columns[repeat_xpath].append( - f"{prefix}/{element.name}" - ) - - else: - child_repeat_xpath = element.get_abbreviated_xpath() - child_repeat_name = child_repeat_xpath.split("/")[-1] - child_repeat_num = repeat_register.extra_data.get( - child_repeat_name, 0 - ) - if child_repeat_num: - build_columns_from_register( - child_repeat_xpath, child_repeat_num, prefix - ) - - try: - repeat_register = MetaData.objects.get( - content_type=content_type, - object_id=self.xform.pk, - data_type=EXPORT_REPEAT_REGISTER, - ) - - except MetaData.DoesNotExist: - build_columns_from_data() - else: - # Build repeat columns from register, start from parent repeat and - # recurse into children repeats - for column_xpath, value in self.ordered_columns.items(): - if isinstance(value, list) and not value: - repeat_name = column_xpath.split("/")[-1] - repeat_num = repeat_register.extra_data.get(repeat_name, 0) - - if repeat_num: - build_columns_from_register(column_xpath, repeat_num) - def _format_for_dataframe(self, cursor): """ Unpacks nested repeat data for export. @@ -895,13 +834,39 @@ def export_to(self, path, cursor, dataview=None): columns_with_hxl = None if self.entity_list is None: - self.ordered_columns = OrderedDict() - self._build_ordered_columns( - self.data_dictionary.survey, self.ordered_columns - ) + content_type = ContentType.objects.get_for_model(self.xform) # creator copy of iterator cursor cursor, ordered_col_cursor = tee(cursor) - self._update_ordered_columns_from_data(ordered_col_cursor) + + try: + columns_register = MetaData.objects.get( + content_type=content_type, + object_id=self.xform.pk, + data_type=EXPORT_COLUMNS_REGISTER, + ) + + except MetaData.DoesNotExist: + self._build_ordered_columns( + self.data_dictionary.survey, self.ordered_columns + ) + self._add_ordered_columns_for_repeat_data(ordered_col_cursor) + # Register export columns for future use + reconstruct_xform_export_register_async.delay(self.xform.pk) + + else: + serialized_columns = columns_register.extra_data.get("split_multiples") + + if not self.split_select_multiples: + serialized_columns = columns_register.extra_data.get( + "merged_multiples" + ) + + self.ordered_columns = json.loads( + serialized_columns, object_pairs_hook=OrderedDict + ) + + self._add_ordered_columns_for_select_multiples() + self._add_ordered_columns_for_gps_fields() # Unpack xform columns and data data = self._format_for_dataframe(cursor) diff --git a/onadata/libs/utils/logger_tools.py b/onadata/libs/utils/logger_tools.py index 8d738bac2b..2061b037d4 100644 --- a/onadata/libs/utils/logger_tools.py +++ b/onadata/libs/utils/logger_tools.py @@ -4,6 +4,7 @@ logger_tools - Logger app utility functions. """ +import importlib import json import logging import os @@ -11,6 +12,7 @@ import sys import tempfile from builtins import str as text +from collections import OrderedDict from datetime import datetime, timedelta from hashlib import sha256 from http.client import BadStatusLine @@ -29,7 +31,7 @@ ValidationError, ) from django.core.files.storage import get_storage_class -from django.db import DataError, IntegrityError, connection, transaction +from django.db import DataError, IntegrityError, transaction from django.db.models import F, Q from django.db.models.query import QuerySet from django.http import ( @@ -109,7 +111,7 @@ safe_delete, set_cache_with_lock, ) -from onadata.libs.utils.common_tags import EXPORT_REPEAT_REGISTER, METADATA_FIELDS +from onadata.libs.utils.common_tags import EXPORT_COLUMNS_REGISTER, METADATA_FIELDS from onadata.libs.utils.common_tools import get_uuid, report_exception from onadata.libs.utils.model_tools import queryset_iterator, set_uuid from onadata.libs.utils.user_auth import get_user_default_project @@ -1524,105 +1526,93 @@ def delete_xform_submissions( ) -def _get_instance_repeat_max(instance: Instance) -> dict[str, int]: - """Get the maximum number of occurrences for each repeat group +def _register_instance_repeat_columns(instance: Instance, register: MetaData) -> None: + """Add Instance repeat columns to the export columns register :param instance: Instance object - :return: Dictionary of repeat counts + :param metadata: MetaData object that stores the export repeat register """ - repeat_max = {} - instance_json = instance.get_dict() + # Avoid cyclic import by using importlib + csv_builder_module = importlib.import_module("onadata.libs.utils.csv_builder") + + with transaction.atomic(): + # We use select_for_update to acquire a row-level lock + # Only one process updates it at a time. This prevents race conditions + # and updates extra_data atomically + register = MetaData.objects.select_for_update().get(pk=register.pk) + merged_multiples = json.loads( + register.extra_data["merged_multiples"], object_pairs_hook=OrderedDict + ) + split_multiples = json.loads( + register.extra_data["split_multiples"], object_pairs_hook=OrderedDict + ) + xform = instance.xform + csv_builder_module = csv_builder_module.CSVDataFrameBuilder( + xform=xform, username=xform.user.username, id_string=xform.id_string + ) + data = instance.get_full_dict() + changes = { + "merged_multiples": merged_multiples, + "split_multiples": split_multiples, + } - def _get_repeat_max(data): for key, value in data.items(): - if isinstance(value, list): - repeat_name = key.split("/")[-1] - repeat_max[repeat_name] = max( - len(value), repeat_max.get(repeat_name, 0) - ) - - for item in value: - if isinstance(item, dict): - _get_repeat_max(item) - - _get_repeat_max(instance_json) - - return repeat_max - - -def _update_export_repeat_register(instance: Instance, metadata: MetaData) -> None: - """Update the export repeat register: - - :param instance: Instance object - :param metadata: MetaData object that stores the export repeat register - """ - repeat_max = _get_instance_repeat_max(instance) - - for repeat, incoming_max in repeat_max.items(): - # Get the maximum between incoming max and the current max - # Done at database level to gurantee atomicity and - # consistency. Avoids race conditions if it were done at the - # application level - - with connection.cursor() as cursor: - cursor.execute( - """ - UPDATE main_metadata - SET extra_data = jsonb_set( - COALESCE(extra_data, '{}'::jsonb), - %s, - GREATEST( - COALESCE((extra_data->>%s)::int, 0), - %s - )::text::jsonb, - true - ) - WHERE id = %s - """, - [ - [repeat], - repeat, - incoming_max, - metadata.pk, - ], + # Reindex split multiples + # pylint: disable=protected-access + csv_builder_module._reindex( + key, + value, + changes["split_multiples"], + data, + xform, + include_images=[], + split_select_multiples=True, + ) + # Reindex merged multiples + # pylint: disable=protected-access + csv_builder_module._reindex( + key, + value, + changes["merged_multiples"], + data, + xform, + include_images=[], + split_select_multiples=False, ) + register.extra_data = {key: json.dumps(value) for key, value in changes.items()} + register.save() + @transaction.atomic() -def register_instance_export_repeats(instance: Instance) -> None: - """Register an Instance's repeats for export +def register_instance_repeat_columns(instance: Instance) -> None: + """Add an Instance repeat columns to the export columns register :param instance: Instance object """ content_type = ContentType.objects.get_for_model(instance.xform) try: - metadata = MetaData.objects.get( + register = MetaData.objects.get( content_type=content_type, object_id=instance.xform.pk, - data_type=EXPORT_REPEAT_REGISTER, + data_type=EXPORT_COLUMNS_REGISTER, ) except MetaData.DoesNotExist: return - _update_export_repeat_register(instance, metadata) + _register_instance_repeat_columns(instance, register) @transaction.atomic() -def register_xform_export_repeats(xform: XForm) -> None: - """Register a XForm's Instances repeats for export +def reconstruct_xform_export_register(xform: XForm) -> None: + """Reconstruct the export columns register for an XForm :param xform: XForm object """ - content_type = ContentType.objects.get_for_model(xform) - metadata, _ = MetaData.objects.get_or_create( - content_type=content_type, - object_id=xform.pk, - data_type=EXPORT_REPEAT_REGISTER, - defaults={"data_value": ""}, - ) + register = MetaData.update_or_create_export_register(xform) instance_qs = xform.instances.filter(deleted_at__isnull=True) - for instance in queryset_iterator(instance_qs): - _update_export_repeat_register(instance, metadata) + for instance in queryset_iterator(instance_qs, chunksize=500): + _register_instance_repeat_columns(instance, register)