diff --git a/adsmp/app.py b/adsmp/app.py index b3fbf9a..994ff6d 100644 --- a/adsmp/app.py +++ b/adsmp/app.py @@ -894,7 +894,7 @@ def should_include_in_sitemap(self, record): 3. If processed, processing isn't too stale Args: - record: Dictionary with record data including bib_data, status, timestamps + record: Dictionary with record data including has_bib_data, status, timestamps Returns: bool: True if record should be included in sitemap, False otherwise @@ -903,14 +903,14 @@ def should_include_in_sitemap(self, record): # Extract values from record dictionary bibcode = record.get('bibcode', None) - bib_data = record.get('bib_data', None) + has_bib_data = record.get('has_bib_data', None) bib_data_updated = record.get('bib_data_updated') solr_processed = record.get('solr_processed') status = record.get('status') # Must have bibliographic data - if not bib_data or not bibcode or (isinstance(bib_data, str) and not bib_data.strip()): - self.logger.debug('Excluding %s from sitemap: No bibcode or bib_data', bibcode) + if not has_bib_data or not bibcode: + self.logger.debug('Excluding %s from sitemap: No bibcode or has_bib_data is False', bibcode) return False # Exclude if SOLR failed or if record is being retried (previously failed) @@ -959,6 +959,8 @@ def get_records_bulk(self, bibcodes, session, load_only=None): record_data = {} for field in (load_only or ['id', 'bibcode', 'bib_data', 'bib_data_updated', 'solr_processed', 'status']): record_data[field] = getattr(record, field, None) + # Add has_bib_data boolean for sitemap checks + record_data['has_bib_data'] = bool(record_data.get('bib_data')) records_dict[record.bibcode] = record_data return records_dict diff --git a/adsmp/tasks.py b/adsmp/tasks.py index 7620cc9..18cb1cf 100644 --- a/adsmp/tasks.py +++ b/adsmp/tasks.py @@ -428,7 +428,7 @@ def task_cleanup_invalid_sitemaps(): session.query( SitemapInfo.id, SitemapInfo.bibcode, - Records.bib_data, + (Records.bib_data.isnot(None)).label('has_bib_data'), Records.bib_data_updated, Records.solr_processed, Records.status @@ -457,7 +457,7 @@ def task_cleanup_invalid_sitemaps(): # Convert to dict for should_include_in_sitemap function record_dict = { 'bibcode': record_data.bibcode, - 'bib_data': record_data.bib_data, + 'has_bib_data': record_data.has_bib_data, 'bib_data_updated': record_data.bib_data_updated, 'solr_processed': record_data.solr_processed, 'status': record_data.status @@ -626,7 +626,7 @@ def task_manage_sitemap(bibcodes, action): # Apply SOLR filtering - convert record to dict for should_include_in_sitemap record_dict = { 'bibcode': record.bibcode, - 'bib_data': record.bib_data, + 'has_bib_data': bool(record.bib_data), 'bib_data_updated': record.bib_data_updated, 'solr_processed': record.solr_processed, 'status': record.status diff --git a/adsmp/tests/test_app.py b/adsmp/tests/test_app.py index 9c4bb21..fabf6e3 100644 --- a/adsmp/tests/test_app.py +++ b/adsmp/tests/test_app.py @@ -471,7 +471,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 1: Record with no bib_data (should be excluded) record_no_data = { 'bibcode': '2023NoData..1..1A', - 'bib_data': None, + 'has_bib_data': False, 'status': 'success' } self.assertFalse(self.app.should_include_in_sitemap(record_no_data), @@ -480,7 +480,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 2: Record with empty bib_data string (should be excluded) record_empty_data = { 'bibcode': '2023Empty..1..1A', - 'bib_data': '', + 'has_bib_data': False, 'status': 'success' } self.assertFalse(self.app.should_include_in_sitemap(record_empty_data), @@ -489,7 +489,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 3: Record with solr-failed status (should be excluded) record_solr_failed = { 'bibcode': '2023Failed..1..1A', - 'bib_data': '{"title": "Test"}', + 'has_bib_data': True, 'status': 'solr-failed' } self.assertFalse(self.app.should_include_in_sitemap(record_solr_failed), @@ -498,7 +498,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 4: Record with retrying status (should be excluded) record_retrying = { 'bibcode': '2023Retrying..1..1A', - 'bib_data': {'title': 'Test'}, + 'has_bib_data': True, 'status': 'retrying' } self.assertFalse(self.app.should_include_in_sitemap(record_retrying), @@ -507,7 +507,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 5: Record with None status (should be included) record_none_status = { 'bibcode': '2023NoneStatus..1..1A', - 'bib_data': {'title': 'Test'}, + 'has_bib_data': True, 'status': None } self.assertTrue(self.app.should_include_in_sitemap(record_none_status), @@ -516,7 +516,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 6: Record with success status (should be included) record_success = { 'bibcode': '2023Success..1..1A', - 'bib_data': '{"title": "Test"}', + 'has_bib_data': True, 'status': 'success', 'bib_data_updated': base_time - timedelta(days=1) } @@ -526,7 +526,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 7: Record with metrics-failed status (should be included - not SOLR-related) record_metrics_failed = { 'bibcode': '2023MetricsFailed..1..1A', - 'bib_data': {'title': 'Test'}, + 'has_bib_data': True, 'status': 'metrics-failed' } self.assertTrue(self.app.should_include_in_sitemap(record_metrics_failed), @@ -535,7 +535,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 8: Record with links-failed status (should be included - not SOLR-related) record_links_failed = { 'bibcode': '2023LinksFailed..1..1A', - 'bib_data': {'title': 'Test'}, + 'has_bib_data': True, 'status': 'links-failed' } self.assertTrue(self.app.should_include_in_sitemap(record_links_failed), @@ -544,7 +544,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 9: Record with None status and no solr_processed (should be included - not yet processed) record_not_processed = { 'bibcode': '2023NotProcessed..1..1A', - 'bib_data': {'title': 'Test'}, + 'has_bib_data': True, 'status': None, 'solr_processed': None } @@ -554,7 +554,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 10: Record with recent solr_processed (should be included) record_recent_solr = { 'bibcode': '2023Recent..1..1A', - 'bib_data': {'title': 'Test'}, + 'has_bib_data': True, 'status': 'success', 'bib_data_updated': base_time - timedelta(days=1), 'solr_processed': base_time # More recent than bib_data_updated @@ -565,7 +565,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 11: Record with stale solr_processed (should be included with warning) record_stale_solr = { 'bibcode': '2023Stale..1..1A', - 'bib_data': {'title': 'Test'}, + 'has_bib_data': True, 'status': 'success', 'bib_data_updated': base_time, 'solr_processed': base_time - timedelta(days=6) # 6 days stale (> 5 day threshold) @@ -576,7 +576,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 12: Record with exactly 5+ days staleness (boundary condition) record_boundary = { 'bibcode': '2023Boundary..1..1A', - 'bib_data': {'title': 'Test'}, + 'has_bib_data': True, 'status': 'success', 'bib_data_updated': base_time, 'solr_processed': base_time - timedelta(days=5, seconds=1) # Just over 5 days @@ -587,7 +587,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 13: Record with no timestamps (should be included) record_no_timestamps = { 'bibcode': '2023NoTimestamps..1..1A', - 'bib_data': {'title': 'Test'}, + 'has_bib_data': True, 'status': 'success', 'bib_data_updated': None, 'solr_processed': None @@ -598,7 +598,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 14: Record with bib_data_updated but no solr_processed (should be included) record_no_solr_time = { 'bibcode': '2023NoSolrTime..1..1A', - 'bib_data': {'title': 'Test'}, + 'has_bib_data': True, 'status': 'success', 'bib_data_updated': base_time, 'solr_processed': None @@ -609,7 +609,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 15: Record with solr_processed but no bib_data_updated (should be included) record_no_bib_time = { 'bibcode': '2023NoBibTime..1..1A', - 'bib_data': {'title': 'Test'}, + 'has_bib_data': True, 'status': 'success', 'bib_data_updated': None, 'solr_processed': base_time @@ -620,7 +620,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 16: Record with very fresh processing (should be included) record_fresh = { 'bibcode': '2023Fresh..1..1A', - 'bib_data': {'title': 'Test'}, + 'has_bib_data': True, 'status': 'success', 'bib_data_updated': base_time - timedelta(minutes=30), 'solr_processed': base_time @@ -631,7 +631,7 @@ def test_should_include_in_sitemap_comprehensive(self): # Test 17: Record with moderate lag (2 days, should be included without warning) record_moderate_lag = { 'bibcode': '2023Moderate..1..1A', - 'bib_data': {'title': 'Test'}, + 'has_bib_data': True, 'status': 'success', 'bib_data_updated': base_time - timedelta(days=2), 'solr_processed': base_time diff --git a/adsmp/tests/test_tasks.py b/adsmp/tests/test_tasks.py index bf1072f..0f1f0c7 100644 --- a/adsmp/tests/test_tasks.py +++ b/adsmp/tests/test_tasks.py @@ -1227,8 +1227,6 @@ def test_task_cleanup_invalid_sitemaps_comprehensive_invalid_cases(self): test_cases = [ # (bibcode, bib_data, status, description) ('2023NoData..1..1A', None, 'success', 'No bib_data'), - ('2023EmptyData..1..1B', '', 'success', 'Empty bib_data'), - ('2023EmptyData2..1..1C', ' ', 'success', 'Whitespace-only bib_data'), ('2023SolrFailed..1..1D', '{"title": "Test"}', 'solr-failed', 'SOLR failed status'), ('2023Retrying..1..1E', '{"title": "Test"}', 'retrying', 'Retrying status'), ] @@ -1297,15 +1295,15 @@ def test_task_cleanup_invalid_sitemaps_comprehensive_invalid_cases(self): # Verify setup total_records = session.query(SitemapInfo).count() - self.assertEqual(total_records, 7, "Should have 7 records (5 invalid + 2 valid)") + self.assertEqual(total_records, 5, "Should have 5 records (3 invalid + 2 valid)") # Execute cleanup with patch.object(self.app, 'delete_sitemap_files') as mock_delete_files: result = tasks.task_cleanup_invalid_sitemaps() - # Verify all invalid records were removed - self.assertEqual(result['invalid_removed'], 5, "Should remove all 5 invalid records") - self.assertEqual(result['total_processed'], 7, "Should process all 7 records") + # Verify invalid records were removed + self.assertEqual(result['invalid_removed'], 3, "Should remove 3 invalid records (None, solr-failed, retrying)") + self.assertEqual(result['total_processed'], 5, "Should process all 5 records") self.assertTrue(result['files_regenerated'], "Should indicate files need regeneration") # Verify files were flagged for regeneration (1 file with mixed valid/invalid records) @@ -1313,16 +1311,18 @@ def test_task_cleanup_invalid_sitemaps_comprehensive_invalid_cases(self): # Verify database state with self.app.session_scope() as session: - # Two valid records should remain + # Only 2 valid records should remain remaining_records = session.query(SitemapInfo).all() self.assertEqual(len(remaining_records), 2, "Should have 2 remaining valid records") remaining_bibcodes = {r.bibcode for r in remaining_records} - self.assertEqual(remaining_bibcodes, {valid_bibcode, valid_bibcode_in_mixed_file}, "Both valid records should remain") + expected_bibcodes = {valid_bibcode, valid_bibcode_in_mixed_file} + self.assertEqual(remaining_bibcodes, expected_bibcodes, "Both valid records should remain") - # All invalid records should be gone - for bibcode, _, _, description in test_cases: + # All invalid records (None bib_data, solr-failed, retrying) should be removed + removed_bibcodes = {'2023NoData..1..1A', '2023SolrFailed..1..1D', '2023Retrying..1..1E'} + for bibcode in removed_bibcodes: invalid_count = session.query(SitemapInfo).filter_by(bibcode=bibcode).count() - self.assertEqual(invalid_count, 0, f"Invalid record should be removed: {description}") + self.assertEqual(invalid_count, 0, f"Invalid record should be removed: {bibcode}") def test_delete_by_bibcode_marks_sitemap_files_for_regeneration(self): """Test that delete_by_bibcode properly marks affected sitemap files for regeneration"""