Merge pull request #77 from arm/axion-data-addition

JoeStech · web-flow · commit a36305b67ad1 · 2026-04-14T15:15:24.000-06:00
add axion data
diff --git a/embedding-generation/eval_questions.json b/embedding-generation/eval_questions.json
@@ -76,10 +76,75 @@
     "expected_urls": ["https://amperecomputing.com/blogs/getting-cloud-native-with-freebsd-on-oci-ampere-a1-with-terraform-"]
   },
   {
-    "question": "In the AWS Graviton performance runbook, how should I define a benchmark and configure the system under test before optimization?",
+    "question": "What Google Axion-backed Compute Engine machine series are available for Arm VMs, and how do C4A and N4A differ?",
     "expected_urls": [
-      "https://github.com/aws/aws-graviton-getting-started/blob/main/perfrunbook/defining_your_benchmark.md",
-      "https://github.com/aws/aws-graviton-getting-started/blob/main/perfrunbook/configuring_your_sut.md"
+      "https://docs.cloud.google.com/compute/docs/instances/arm-on-compute"
+    ]
+  },
+  {
+    "question": "On Google Cloud CPU platforms, which Arm machine series map to Google Axion versus Ampere Altra, and how are vCPUs counted on those platforms?",
+    "expected_urls": [
+      "https://docs.cloud.google.com/compute/docs/cpu-platforms"
+    ]
+  },
+  {
+    "question": "What storage, networking, and workload positioning does Google Cloud call out for the C4A and N4A general-purpose machine families?",
+    "expected_urls": [
+      "https://docs.cloud.google.com/compute/docs/general-purpose-machines"
+    ]
+  },
+  {
+    "question": "How does Google Cloud's next-generation dynamic resource management improve N4A VM placement and live migration for Axion workloads?",
+    "expected_urls": [
+      "https://docs.cloud.google.com/compute/docs/dynamic-resource-management"
+    ]
+  },
+  {
+    "question": "What C4A bare metal instance options does Compute Engine provide, including the machine type, vCPU count, memory, and network bandwidth?",
+    "expected_urls": [
+      "https://docs.cloud.google.com/compute/docs/instances/bare-metal-instances"
+    ]
+  },
+  {
+    "question": "In Google Cloud Troubleshooting Arm VMs, what happens if you create an Arm VM from a boot disk with x86 architecture, and how do serial console logs help identify it?",
+    "expected_urls": [
+      "https://docs.cloud.google.com/compute/docs/troubleshooting/troubleshooting-arm-vms"
+    ]
+  },
+  {
+    "question": "Which GKE cluster modes and machine families support Arm workloads, and what are the major limitations for Arm nodes on GKE?",
+    "expected_urls": [
+      "https://docs.cloud.google.com/kubernetes-engine/docs/concepts/arm-on-gke"
+    ]
+  },
+  {
+    "question": "How do you build and verify a multi-architecture container image so the same image can run on both x86 and Arm nodes in GKE?",
+    "expected_urls": [
+      "https://docs.cloud.google.com/kubernetes-engine/docs/how-to/build-multi-arch-for-arm"
+    ]
+  },
+  {
+    "question": "In a GKE Standard cluster, why are Arm nodes tainted by default and what selectors or tolerations are needed to schedule workloads onto them?",
+    "expected_urls": [
+      "https://docs.cloud.google.com/kubernetes-engine/docs/how-to/prepare-arm-workloads-for-deployment"
+    ]
+  },
+  {
+    "question": "How do you request Arm nodes in GKE Autopilot, and what does kubernetes.io/arch=arm64 select on newer versus older cluster versions?",
+    "expected_urls": [
+      "https://docs.cloud.google.com/kubernetes-engine/docs/how-to/autopilot-arm-workloads"
+    ]
+  },
+  {
+    "question": "What are the main steps in the GKE tutorial for migrating an x86-only application to a multi-arch image that also runs on Arm?",
+    "expected_urls": [
+      "https://docs.cloud.google.com/kubernetes-engine/docs/tutorials/migrate-x86-to-multi-arch-arm"
+    ]
+  },
+  {
+    "question": "In GKE Troubleshooting Arm workloads, what does the Pod log message \"exec ./hello-app: exec format error\" mean, and what is the recommended fix?",
+    "expected_urls": [
+      "https://docs.cloud.google.com/kubernetes-engine/docs/troubleshooting/arm-workloads"
     ]
   }
 ]
diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py
@@ -163,13 +163,26 @@ def register_source(site_name, license_type, display_name, url, keywords):
         return False
     
     known_source_urls.add(url)
-    all_sources.append({
+    source_entry = {
         'site_name': site_name,
         'license_type': license_type,
         'display_name': display_name,
         'url': url,
         'keywords': keywords if isinstance(keywords, str) else '; '.join(keywords)
-    })
+    }
+
+    # Keep discovered sources grouped with their existing site section instead of
+    # appending them to the very end of the CSV and fragmenting that block.
+    insert_at = None
+    for index, existing_source in enumerate(all_sources):
+        if existing_source.get('site_name') == site_name:
+            insert_at = index + 1
+
+    if insert_at is None:
+        all_sources.append(source_entry)
+    else:
+        all_sources.insert(insert_at, source_entry)
+
     print(f"[NEW SOURCE] {display_name}: {url}")
     return True
 
diff --git a/embedding-generation/tests/test_generate_chunks.py b/embedding-generation/tests/test_generate_chunks.py
@@ -155,6 +155,57 @@ def test_register_source_duplicate(self, gc):
         assert result is False
         assert len(gc.all_sources) == 1
 
+    def test_register_source_inserts_after_matching_site_group(self, gc):
+        """Test that new sources stay grouped with existing sources from the same site."""
+        gc.all_sources = [
+            {
+                'site_name': 'Google Cloud',
+                'license_type': 'CC4.0',
+                'display_name': 'Google 1',
+                'url': 'https://example.com/google-1',
+                'keywords': 'g1'
+            },
+            {
+                'site_name': 'Ecosystem Dashboard',
+                'license_type': 'Arm Proprietary',
+                'display_name': 'Dashboard 1',
+                'url': 'https://example.com/dashboard-1',
+                'keywords': 'd1'
+            },
+            {
+                'site_name': 'Ecosystem Dashboard',
+                'license_type': 'Arm Proprietary',
+                'display_name': 'Dashboard 2',
+                'url': 'https://example.com/dashboard-2',
+                'keywords': 'd2'
+            },
+            {
+                'site_name': 'AWS Graviton',
+                'license_type': 'Apache-2.0',
+                'display_name': 'Graviton 1',
+                'url': 'https://example.com/graviton-1',
+                'keywords': 'a1'
+            },
+        ]
+        gc.known_source_urls = {source['url'] for source in gc.all_sources}
+
+        result = gc.register_source(
+            site_name="Ecosystem Dashboard",
+            license_type="Arm Proprietary",
+            display_name="Dashboard 3",
+            url="https://example.com/dashboard-3",
+            keywords=["d3"]
+        )
+
+        assert result is True
+        assert [source['display_name'] for source in gc.all_sources] == [
+            'Google 1',
+            'Dashboard 1',
+            'Dashboard 2',
+            'Dashboard 3',
+            'Graviton 1',
+        ]
+
     def test_register_source_url_normalization(self, gc):
         """Test that URLs are stripped of whitespace."""
         gc.register_source(
diff --git a/embedding-generation/vector-db-sources.csv b/embedding-generation/vector-db-sources.csv