Merge pull request #42 from microsoft/adesousa_microsoft/support-citations data scripts update
microsoft · Aug 13, 2024 · 564195c
2 parents b00543a + 46fde79
commit 564195c
Show file tree

Hide file tree

Showing 17 changed files with 324 additions and 258 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,4 +7,7 @@ __pycache__/
 .ipynb_checkpoints/
 
 data
-static
+static
+
+scripts/config.json
+venv
diff --git a/app.py b/app.py
@@ -163,7 +163,6 @@ def init_ai_search_client():
         endpoint = app_settings.datasource.endpoint
         key_credential = app_settings.datasource.key
         index_name = app_settings.datasource.index
-
         client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(key_credential))
         return client
     except Exception as e:
@@ -889,10 +888,10 @@ async def generate_section_content():
         logging.exception("Exception in /history/clear_messages")
         return jsonify({"error": str(e)}), 500
 
-@bp.route("/document/<documentId>")
-async def get_document(documentId):
+@bp.route("/document/<filepath>")
+async def get_document(filepath):
     try:
-        document = retrieve_document(documentId)
+        document = retrieve_document(filepath)
         return jsonify(document), 200
     except Exception as e:
         logging.exception("Exception in /history/clear_messages")
@@ -1006,10 +1005,15 @@ async def generate_section_content(request_json):
     except Exception as e:
         raise e
 
-def retrieve_document(id):
+def retrieve_document(filepath):
     try:
         search_client = init_ai_search_client()
-        document = search_client.get_document(id)
+        search_query = f"filepath eq '{filepath}'"
+        # Execute the search query
+        results = search_client.search(search_query)
+
+        # Get the full_content of the first result
+        document = next(results)
         return document
     except Exception as e:
         logging.exception("Exception in retrieve_document")

diff --git a/frontend/src/pages/chat/Chat.tsx b/frontend/src/pages/chat/Chat.tsx
@@ -783,7 +783,7 @@ const Chat = ({ type = ChatType.Browse }: Props) => {
   }, [showLoadingMessage, processMessages])
 
   const onShowCitation = (citation: Citation) => {
-    const path = `/#/document/${citation.id}`
+    const path = `/#/document/${citation.filepath}`;
 
     // Instead of navigating within the app, use window.open to open in a new tab
     const url = window.location.origin + path

diff --git a/frontend/src/pages/document/Document.tsx b/frontend/src/pages/document/Document.tsx
@@ -5,6 +5,7 @@ import { useParams } from 'react-router-dom';
 // Define the interface for the document data
 interface DocumentData {
   content: string;
+  full_content: string;
 }
 
 const Document = (): JSX.Element => {
@@ -19,7 +20,6 @@ const Document = (): JSX.Element => {
       try {
         const response = await documentRead(id);
         const data = await response.json();
-
         setDocument(data);
       } catch (error) {
         console.error(error);
@@ -39,7 +39,7 @@ const Document = (): JSX.Element => {
       {isLoading ? ( // Step 4
         <p>Loading...</p>
       ) : document ? (
-        <p>{document.content}</p>
+        <p>{document.full_content}</p>
       ) : (
         <h1>Document not found. Please try again.</h1>
       )}

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,5 +1,5 @@
 -r requirements.txt
-azure-ai-formrecognizer==3.2.1
+azure-ai-documentintelligence==1.0.0b2
 Markdown==3.4.4
 requests==2.32.3
 tqdm==4.66.1
@@ -9,6 +9,8 @@ bs4==0.0.1
 urllib3==2.2.2
 pytest==7.4.0
 pytest-asyncio==0.23.2
+PyMuPDF==1.24.5
 azure-storage-blob
 chardet
-azure-keyvault-secrets
+azure-keyvault-secrets
+coverage
diff --git a/requirements.txt b/requirements.txt
@@ -9,5 +9,4 @@ quart==0.19.4
 uvicorn==0.24.0
 aiohttp==3.9.2
 gunicorn==20.1.0
-pydantic-settings==2.2.1
-azure-search-documents
+pydantic-settings==2.2.1
diff --git a/scripts/auth_init.ps1 b/scripts/auth_init.ps1
diff --git a/scripts/auth_init.sh b/scripts/auth_init.sh
diff --git a/scripts/auth_update.py b/scripts/auth_update.py
diff --git a/scripts/auth_update.sh b/scripts/auth_update.sh
diff --git a/scripts/data_preparation.py b/scripts/data_preparation.py
@@ -7,7 +7,7 @@
 import time
 
 import requests
-from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.ai.documentintelligence import DocumentIntelligenceClient
 from azure.core.credentials import AzureKeyCredential
 from azure.identity import AzureCliCredential
 from azure.search.documents import SearchClient
@@ -182,6 +182,15 @@ def create_or_update_search_index(
                 "filterable": False,
                 "analyzer": f"{language}.lucene" if language else None,
             },
+            {
+                "name": "full_content",
+                "type": "Edm.String",
+                "searchable": True,
+                "sortable": False,
+                "facetable": False,
+                "filterable": False,
+                "analyzer": f"{language}.lucene" if language else None,
+            },
             {
                 "name": "title",
                 "type": "Edm.String",
@@ -209,6 +218,14 @@ def create_or_update_search_index(
                 "type": "Edm.String",
                 "searchable": True,
             },
+            {
+                "name": "image_mapping",
+                "type": "Edm.String",
+                "searchable": False,
+                "sortable": False,
+                "facetable": False,
+                "filterable": False
+            }
         ],
         "suggesters": [],
         "scoringProfiles": [],
@@ -356,7 +373,7 @@ def validate_index(service_name, subscription_id, resource_group, index_name):
                 print(f"Request failed. Please investigate. Status code: {response.status_code}")
             break
 
-def create_index(config, credential, form_recognizer_client=None, embedding_model_endpoint=None, use_layout=False, njobs=4):
+def create_index(config, credential, form_recognizer_client=None, embedding_model_endpoint=None, use_layout=False, njobs=4, captioning_model_endpoint=None, captioning_model_key=None):
     service_name = config["search_service_name"]
     subscription_id = config["subscription_id"]
     resource_group = config["resource_group"]
@@ -410,7 +427,8 @@ def create_index(config, credential, form_recognizer_client=None, embedding_mode
         elif os.path.exists(data_config["path"]):
             result = chunk_directory(data_config["path"], num_tokens=config["chunk_size"], token_overlap=config.get("token_overlap",0),
                                     azure_credential=credential, form_recognizer_client=form_recognizer_client, use_layout=use_layout, njobs=njobs,
-                                    add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint, url_prefix=data_config["url_prefix"])
+                                    add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint, url_prefix=data_config["url_prefix"],
+                                    captioning_model_endpoint=captioning_model_endpoint, captioning_model_key=captioning_model_key)
         else:
             raise Exception(f"Path {data_config['path']} does not exist and is not a blob URL. Please check the path and try again.")
 
@@ -448,6 +466,8 @@ def valid_range(n):
     parser.add_argument("--embedding-model-endpoint", type=str, help="Endpoint for the embedding model to use for vector search. Format: 'https://<AOAI resource name>.openai.azure.com/openai/deployments/<Ada deployment name>/embeddings?api-version=2024-03-01-Preview'")
     parser.add_argument("--embedding-model-key", type=str, help="Key for the embedding model to use for vector search.")
     parser.add_argument("--search-admin-key", type=str, help="Admin key for the search service. If not provided, will use Azure CLI to get the key.")
+    parser.add_argument("--azure-openai-endpoint", type=str, help="Endpoint for the (Azure) OpenAI API. Format: 'https://<AOAI resource name>.openai.azure.com/openai/deployments/<vision model name>/chat/completions?api-version=2024-04-01-preview'")
+    parser.add_argument("--azure-openai-key", type=str, help="Key for the (Azure) OpenAI API.")
     args = parser.parse_args()
 
     with open(args.config) as f:
@@ -464,15 +484,15 @@ def valid_range(n):
         os.environ["FORM_RECOGNIZER_ENDPOINT"] = f"https://{args.form_rec_resource}.cognitiveservices.azure.com/"
         os.environ["FORM_RECOGNIZER_KEY"] = args.form_rec_key
         if args.njobs==1:
-            form_recognizer_client = DocumentAnalysisClient(endpoint=f"https://{args.form_rec_resource}.cognitiveservices.azure.com/", credential=AzureKeyCredential(args.form_rec_key))
+            form_recognizer_client = DocumentIntelligenceClient(endpoint=f"https://{args.form_rec_resource}.cognitiveservices.azure.com/", credential=AzureKeyCredential(args.form_rec_key))
         print(f"Using Form Recognizer resource {args.form_rec_resource} for PDF cracking, with the {'Layout' if args.form_rec_use_layout else 'Read'} model.")
 
     for index_config in config:
         print("Preparing data for index:", index_config["index_name"])
         if index_config.get("vector_config_name") and not args.embedding_model_endpoint:
             raise Exception("ERROR: Vector search is enabled in the config, but no embedding model endpoint and key were provided. Please provide these values or disable vector search.")
 
-        create_index(index_config, credential, form_recognizer_client, embedding_model_endpoint=args.embedding_model_endpoint, use_layout=args.form_rec_use_layout, njobs=args.njobs)
+        create_index(index_config, credential, form_recognizer_client, embedding_model_endpoint=args.embedding_model_endpoint, use_layout=args.form_rec_use_layout, njobs=args.njobs, captioning_model_endpoint=args.azure_openai_endpoint, captioning_model_key=args.azure_openai_key)
         print("Data preparation for index", index_config["index_name"], "completed")
 
     print(f"Data preparation script completed. {len(config)} indexes updated.")
Expanded diff view of .gitignore (columns: original file line number, diff line change):
@@ -7,4 +7,7 @@ __pycache__/
 .ipynb_checkpoints/
 data
-static
+static
+scripts/config.json
+venv