Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions adsmp/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -894,7 +894,7 @@ def should_include_in_sitemap(self, record):
3. If processed, processing isn't too stale

Args:
record: Dictionary with record data including bib_data, status, timestamps
record: Dictionary with record data including has_bib_data, status, timestamps

Returns:
bool: True if record should be included in sitemap, False otherwise
Expand All @@ -903,14 +903,14 @@ def should_include_in_sitemap(self, record):

# Extract values from record dictionary
bibcode = record.get('bibcode', None)
bib_data = record.get('bib_data', None)
has_bib_data = record.get('has_bib_data', None)
bib_data_updated = record.get('bib_data_updated')
solr_processed = record.get('solr_processed')
status = record.get('status')

# Must have bibliographic data
if not bib_data or not bibcode or (isinstance(bib_data, str) and not bib_data.strip()):
self.logger.debug('Excluding %s from sitemap: No bibcode or bib_data', bibcode)
if not has_bib_data or not bibcode:
self.logger.debug('Excluding %s from sitemap: No bibcode or has_bib_data is False', bibcode)
return False

# Exclude if SOLR failed or if record is being retried (previously failed)
Expand Down Expand Up @@ -959,6 +959,8 @@ def get_records_bulk(self, bibcodes, session, load_only=None):
record_data = {}
for field in (load_only or ['id', 'bibcode', 'bib_data', 'bib_data_updated', 'solr_processed', 'status']):
record_data[field] = getattr(record, field, None)
# Add has_bib_data boolean for sitemap checks
record_data['has_bib_data'] = bool(record_data.get('bib_data'))
records_dict[record.bibcode] = record_data

return records_dict
Expand Down
6 changes: 3 additions & 3 deletions adsmp/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ def task_cleanup_invalid_sitemaps():
session.query(
SitemapInfo.id,
SitemapInfo.bibcode,
Records.bib_data,
(Records.bib_data.isnot(None)).label('has_bib_data'),
Records.bib_data_updated,
Records.solr_processed,
Records.status
Expand Down Expand Up @@ -457,7 +457,7 @@ def task_cleanup_invalid_sitemaps():
# Convert to dict for should_include_in_sitemap function
record_dict = {
'bibcode': record_data.bibcode,
'bib_data': record_data.bib_data,
'has_bib_data': record_data.has_bib_data,
'bib_data_updated': record_data.bib_data_updated,
'solr_processed': record_data.solr_processed,
'status': record_data.status
Expand Down Expand Up @@ -626,7 +626,7 @@ def task_manage_sitemap(bibcodes, action):
# Apply SOLR filtering - convert record to dict for should_include_in_sitemap
record_dict = {
'bibcode': record.bibcode,
'bib_data': record.bib_data,
'has_bib_data': bool(record.bib_data),
'bib_data_updated': record.bib_data_updated,
'solr_processed': record.solr_processed,
'status': record.status
Expand Down
34 changes: 17 additions & 17 deletions adsmp/tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 1: Record with no bib_data (should be excluded)
record_no_data = {
'bibcode': '2023NoData..1..1A',
'bib_data': None,
'has_bib_data': False,
'status': 'success'
}
self.assertFalse(self.app.should_include_in_sitemap(record_no_data),
Expand All @@ -480,7 +480,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 2: Record with empty bib_data string (should be excluded)
record_empty_data = {
'bibcode': '2023Empty..1..1A',
'bib_data': '',
'has_bib_data': False,
'status': 'success'
}
self.assertFalse(self.app.should_include_in_sitemap(record_empty_data),
Expand All @@ -489,7 +489,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 3: Record with solr-failed status (should be excluded)
record_solr_failed = {
'bibcode': '2023Failed..1..1A',
'bib_data': '{"title": "Test"}',
'has_bib_data': True,
'status': 'solr-failed'
}
self.assertFalse(self.app.should_include_in_sitemap(record_solr_failed),
Expand All @@ -498,7 +498,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 4: Record with retrying status (should be excluded)
record_retrying = {
'bibcode': '2023Retrying..1..1A',
'bib_data': {'title': 'Test'},
'has_bib_data': True,
'status': 'retrying'
}
self.assertFalse(self.app.should_include_in_sitemap(record_retrying),
Expand All @@ -507,7 +507,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 5: Record with None status (should be included)
record_none_status = {
'bibcode': '2023NoneStatus..1..1A',
'bib_data': {'title': 'Test'},
'has_bib_data': True,
'status': None
}
self.assertTrue(self.app.should_include_in_sitemap(record_none_status),
Expand All @@ -516,7 +516,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 6: Record with success status (should be included)
record_success = {
'bibcode': '2023Success..1..1A',
'bib_data': '{"title": "Test"}',
'has_bib_data': True,
'status': 'success',
'bib_data_updated': base_time - timedelta(days=1)
}
Expand All @@ -526,7 +526,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 7: Record with metrics-failed status (should be included - not SOLR-related)
record_metrics_failed = {
'bibcode': '2023MetricsFailed..1..1A',
'bib_data': {'title': 'Test'},
'has_bib_data': True,
'status': 'metrics-failed'
}
self.assertTrue(self.app.should_include_in_sitemap(record_metrics_failed),
Expand All @@ -535,7 +535,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 8: Record with links-failed status (should be included - not SOLR-related)
record_links_failed = {
'bibcode': '2023LinksFailed..1..1A',
'bib_data': {'title': 'Test'},
'has_bib_data': True,
'status': 'links-failed'
}
self.assertTrue(self.app.should_include_in_sitemap(record_links_failed),
Expand All @@ -544,7 +544,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 9: Record with None status and no solr_processed (should be included - not yet processed)
record_not_processed = {
'bibcode': '2023NotProcessed..1..1A',
'bib_data': {'title': 'Test'},
'has_bib_data': True,
'status': None,
'solr_processed': None
}
Expand All @@ -554,7 +554,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 10: Record with recent solr_processed (should be included)
record_recent_solr = {
'bibcode': '2023Recent..1..1A',
'bib_data': {'title': 'Test'},
'has_bib_data': True,
'status': 'success',
'bib_data_updated': base_time - timedelta(days=1),
'solr_processed': base_time # More recent than bib_data_updated
Expand All @@ -565,7 +565,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 11: Record with stale solr_processed (should be included with warning)
record_stale_solr = {
'bibcode': '2023Stale..1..1A',
'bib_data': {'title': 'Test'},
'has_bib_data': True,
'status': 'success',
'bib_data_updated': base_time,
'solr_processed': base_time - timedelta(days=6) # 6 days stale (> 5 day threshold)
Expand All @@ -576,7 +576,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 12: Record with exactly 5+ days staleness (boundary condition)
record_boundary = {
'bibcode': '2023Boundary..1..1A',
'bib_data': {'title': 'Test'},
'has_bib_data': True,
'status': 'success',
'bib_data_updated': base_time,
'solr_processed': base_time - timedelta(days=5, seconds=1) # Just over 5 days
Expand All @@ -587,7 +587,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 13: Record with no timestamps (should be included)
record_no_timestamps = {
'bibcode': '2023NoTimestamps..1..1A',
'bib_data': {'title': 'Test'},
'has_bib_data': True,
'status': 'success',
'bib_data_updated': None,
'solr_processed': None
Expand All @@ -598,7 +598,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 14: Record with bib_data_updated but no solr_processed (should be included)
record_no_solr_time = {
'bibcode': '2023NoSolrTime..1..1A',
'bib_data': {'title': 'Test'},
'has_bib_data': True,
'status': 'success',
'bib_data_updated': base_time,
'solr_processed': None
Expand All @@ -609,7 +609,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 15: Record with solr_processed but no bib_data_updated (should be included)
record_no_bib_time = {
'bibcode': '2023NoBibTime..1..1A',
'bib_data': {'title': 'Test'},
'has_bib_data': True,
'status': 'success',
'bib_data_updated': None,
'solr_processed': base_time
Expand All @@ -620,7 +620,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 16: Record with very fresh processing (should be included)
record_fresh = {
'bibcode': '2023Fresh..1..1A',
'bib_data': {'title': 'Test'},
'has_bib_data': True,
'status': 'success',
'bib_data_updated': base_time - timedelta(minutes=30),
'solr_processed': base_time
Expand All @@ -631,7 +631,7 @@ def test_should_include_in_sitemap_comprehensive(self):
# Test 17: Record with moderate lag (2 days, should be included without warning)
record_moderate_lag = {
'bibcode': '2023Moderate..1..1A',
'bib_data': {'title': 'Test'},
'has_bib_data': True,
'status': 'success',
'bib_data_updated': base_time - timedelta(days=2),
'solr_processed': base_time
Expand Down
22 changes: 11 additions & 11 deletions adsmp/tests/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1227,8 +1227,6 @@ def test_task_cleanup_invalid_sitemaps_comprehensive_invalid_cases(self):
test_cases = [
# (bibcode, bib_data, status, description)
('2023NoData..1..1A', None, 'success', 'No bib_data'),
('2023EmptyData..1..1B', '', 'success', 'Empty bib_data'),
('2023EmptyData2..1..1C', ' ', 'success', 'Whitespace-only bib_data'),
('2023SolrFailed..1..1D', '{"title": "Test"}', 'solr-failed', 'SOLR failed status'),
('2023Retrying..1..1E', '{"title": "Test"}', 'retrying', 'Retrying status'),
]
Expand Down Expand Up @@ -1297,32 +1295,34 @@ def test_task_cleanup_invalid_sitemaps_comprehensive_invalid_cases(self):

# Verify setup
total_records = session.query(SitemapInfo).count()
self.assertEqual(total_records, 7, "Should have 7 records (5 invalid + 2 valid)")
self.assertEqual(total_records, 5, "Should have 5 records (3 invalid + 2 valid)")

# Execute cleanup
with patch.object(self.app, 'delete_sitemap_files') as mock_delete_files:
result = tasks.task_cleanup_invalid_sitemaps()

# Verify all invalid records were removed
self.assertEqual(result['invalid_removed'], 5, "Should remove all 5 invalid records")
self.assertEqual(result['total_processed'], 7, "Should process all 7 records")
# Verify invalid records were removed
self.assertEqual(result['invalid_removed'], 3, "Should remove 3 invalid records (None, solr-failed, retrying)")
self.assertEqual(result['total_processed'], 5, "Should process all 5 records")
self.assertTrue(result['files_regenerated'], "Should indicate files need regeneration")

# Verify files were flagged for regeneration (1 file with mixed valid/invalid records)
self.assertGreaterEqual(result['files_flagged'], 1, "Should have flagged at least 1 file")

# Verify database state
with self.app.session_scope() as session:
# Two valid records should remain
# Only 2 valid records should remain
remaining_records = session.query(SitemapInfo).all()
self.assertEqual(len(remaining_records), 2, "Should have 2 remaining valid records")
remaining_bibcodes = {r.bibcode for r in remaining_records}
self.assertEqual(remaining_bibcodes, {valid_bibcode, valid_bibcode_in_mixed_file}, "Both valid records should remain")
expected_bibcodes = {valid_bibcode, valid_bibcode_in_mixed_file}
self.assertEqual(remaining_bibcodes, expected_bibcodes, "Both valid records should remain")

# All invalid records should be gone
for bibcode, _, _, description in test_cases:
# All invalid records (None bib_data, solr-failed, retrying) should be removed
removed_bibcodes = {'2023NoData..1..1A', '2023SolrFailed..1..1D', '2023Retrying..1..1E'}
for bibcode in removed_bibcodes:
invalid_count = session.query(SitemapInfo).filter_by(bibcode=bibcode).count()
self.assertEqual(invalid_count, 0, f"Invalid record should be removed: {description}")
self.assertEqual(invalid_count, 0, f"Invalid record should be removed: {bibcode}")

def test_delete_by_bibcode_marks_sitemap_files_for_regeneration(self):
"""Test that delete_by_bibcode properly marks affected sitemap files for regeneration"""
Expand Down