Skip to content

Commit

Permalink
Merge pull request #42 from microsoft/adesousa_microsoft/support-citations
Browse files Browse the repository at this point in the history

data scripts update
  • Loading branch information
andrewldesousa authored Aug 13, 2024
2 parents b00543a + 46fde79 commit 564195c
Show file tree
Hide file tree
Showing 17 changed files with 324 additions and 258 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,7 @@ __pycache__/
.ipynb_checkpoints/

data
static
static

scripts/config.json
venv
16 changes: 10 additions & 6 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@ def init_ai_search_client():
endpoint = app_settings.datasource.endpoint
key_credential = app_settings.datasource.key
index_name = app_settings.datasource.index

client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(key_credential))
return client
except Exception as e:
Expand Down Expand Up @@ -889,10 +888,10 @@ async def generate_section_content():
logging.exception("Exception in /history/clear_messages")
return jsonify({"error": str(e)}), 500

@bp.route("/document/<documentId>")
async def get_document(documentId):
@bp.route("/document/<filepath>")
async def get_document(filepath):
try:
document = retrieve_document(documentId)
document = retrieve_document(filepath)
return jsonify(document), 200
except Exception as e:
logging.exception("Exception in /history/clear_messages")
Expand Down Expand Up @@ -1006,10 +1005,15 @@ async def generate_section_content(request_json):
except Exception as e:
raise e

def retrieve_document(id):
def retrieve_document(filepath):
try:
search_client = init_ai_search_client()
document = search_client.get_document(id)
search_query = f"filepath eq '{filepath}'"
# Execute the search query
results = search_client.search(search_query)

# Get the full_content of the first result
document = next(results)
return document
except Exception as e:
logging.exception("Exception in retrieve_document")
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/pages/chat/Chat.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -783,7 +783,7 @@ const Chat = ({ type = ChatType.Browse }: Props) => {
}, [showLoadingMessage, processMessages])

const onShowCitation = (citation: Citation) => {
const path = `/#/document/${citation.id}`
const path = `/#/document/${citation.filepath}`;

// Instead of navigating within the app, use window.open to open in a new tab
const url = window.location.origin + path
Expand Down
4 changes: 2 additions & 2 deletions frontend/src/pages/document/Document.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { useParams } from 'react-router-dom';
// Define the interface for the document data
interface DocumentData {
content: string;
full_content: string;
}

const Document = (): JSX.Element => {
Expand All @@ -19,7 +20,6 @@ const Document = (): JSX.Element => {
try {
const response = await documentRead(id);
const data = await response.json();

setDocument(data);
} catch (error) {
console.error(error);
Expand All @@ -39,7 +39,7 @@ const Document = (): JSX.Element => {
{isLoading ? ( // Step 4
<p>Loading...</p>
) : document ? (
<p>{document.content}</p>
<p>{document.full_content}</p>
) : (
<h1>Document not found. Please try again.</h1>
)}
Expand Down
6 changes: 4 additions & 2 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-r requirements.txt
azure-ai-formrecognizer==3.2.1
azure-ai-documentintelligence==1.0.0b2
Markdown==3.4.4
requests==2.32.3
tqdm==4.66.1
Expand All @@ -9,6 +9,8 @@ bs4==0.0.1
urllib3==2.2.2
pytest==7.4.0
pytest-asyncio==0.23.2
PyMuPDF==1.24.5
azure-storage-blob
chardet
azure-keyvault-secrets
azure-keyvault-secrets
coverage
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,4 @@ quart==0.19.4
uvicorn==0.24.0
aiohttp==3.9.2
gunicorn==20.1.0
pydantic-settings==2.2.1
azure-search-documents
pydantic-settings==2.2.1
Empty file modified scripts/auth_init.ps1
100755 → 100644
Empty file.
Empty file modified scripts/auth_init.sh
100755 → 100644
Empty file.
Empty file modified scripts/auth_update.py
100755 → 100644
Empty file.
Empty file modified scripts/auth_update.sh
100755 → 100644
Empty file.
30 changes: 25 additions & 5 deletions scripts/data_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import time

import requests
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential
from azure.identity import AzureCliCredential
from azure.search.documents import SearchClient
Expand Down Expand Up @@ -182,6 +182,15 @@ def create_or_update_search_index(
"filterable": False,
"analyzer": f"{language}.lucene" if language else None,
},
{
"name": "full_content",
"type": "Edm.String",
"searchable": True,
"sortable": False,
"facetable": False,
"filterable": False,
"analyzer": f"{language}.lucene" if language else None,
},
{
"name": "title",
"type": "Edm.String",
Expand Down Expand Up @@ -209,6 +218,14 @@ def create_or_update_search_index(
"type": "Edm.String",
"searchable": True,
},
{
"name": "image_mapping",
"type": "Edm.String",
"searchable": False,
"sortable": False,
"facetable": False,
"filterable": False
}
],
"suggesters": [],
"scoringProfiles": [],
Expand Down Expand Up @@ -356,7 +373,7 @@ def validate_index(service_name, subscription_id, resource_group, index_name):
print(f"Request failed. Please investigate. Status code: {response.status_code}")
break

def create_index(config, credential, form_recognizer_client=None, embedding_model_endpoint=None, use_layout=False, njobs=4):
def create_index(config, credential, form_recognizer_client=None, embedding_model_endpoint=None, use_layout=False, njobs=4, captioning_model_endpoint=None, captioning_model_key=None):
service_name = config["search_service_name"]
subscription_id = config["subscription_id"]
resource_group = config["resource_group"]
Expand Down Expand Up @@ -410,7 +427,8 @@ def create_index(config, credential, form_recognizer_client=None, embedding_mode
elif os.path.exists(data_config["path"]):
result = chunk_directory(data_config["path"], num_tokens=config["chunk_size"], token_overlap=config.get("token_overlap",0),
azure_credential=credential, form_recognizer_client=form_recognizer_client, use_layout=use_layout, njobs=njobs,
add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint, url_prefix=data_config["url_prefix"])
add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint, url_prefix=data_config["url_prefix"],
captioning_model_endpoint=captioning_model_endpoint, captioning_model_key=captioning_model_key)
else:
raise Exception(f"Path {data_config['path']} does not exist and is not a blob URL. Please check the path and try again.")

Expand Down Expand Up @@ -448,6 +466,8 @@ def valid_range(n):
parser.add_argument("--embedding-model-endpoint", type=str, help="Endpoint for the embedding model to use for vector search. Format: 'https://<AOAI resource name>.openai.azure.com/openai/deployments/<Ada deployment name>/embeddings?api-version=2024-03-01-Preview'")
parser.add_argument("--embedding-model-key", type=str, help="Key for the embedding model to use for vector search.")
parser.add_argument("--search-admin-key", type=str, help="Admin key for the search service. If not provided, will use Azure CLI to get the key.")
parser.add_argument("--azure-openai-endpoint", type=str, help="Endpoint for the (Azure) OpenAI API. Format: 'https://<AOAI resource name>.openai.azure.com/openai/deployments/<vision model name>/chat/completions?api-version=2024-04-01-preview'")
parser.add_argument("--azure-openai-key", type=str, help="Key for the (Azure) OpenAI API.")
args = parser.parse_args()

with open(args.config) as f:
Expand All @@ -464,15 +484,15 @@ def valid_range(n):
os.environ["FORM_RECOGNIZER_ENDPOINT"] = f"https://{args.form_rec_resource}.cognitiveservices.azure.com/"
os.environ["FORM_RECOGNIZER_KEY"] = args.form_rec_key
if args.njobs==1:
form_recognizer_client = DocumentAnalysisClient(endpoint=f"https://{args.form_rec_resource}.cognitiveservices.azure.com/", credential=AzureKeyCredential(args.form_rec_key))
form_recognizer_client = DocumentIntelligenceClient(endpoint=f"https://{args.form_rec_resource}.cognitiveservices.azure.com/", credential=AzureKeyCredential(args.form_rec_key))
print(f"Using Form Recognizer resource {args.form_rec_resource} for PDF cracking, with the {'Layout' if args.form_rec_use_layout else 'Read'} model.")

for index_config in config:
print("Preparing data for index:", index_config["index_name"])
if index_config.get("vector_config_name") and not args.embedding_model_endpoint:
raise Exception("ERROR: Vector search is enabled in the config, but no embedding model endpoint and key were provided. Please provide these values or disable vector search.")

create_index(index_config, credential, form_recognizer_client, embedding_model_endpoint=args.embedding_model_endpoint, use_layout=args.form_rec_use_layout, njobs=args.njobs)
create_index(index_config, credential, form_recognizer_client, embedding_model_endpoint=args.embedding_model_endpoint, use_layout=args.form_rec_use_layout, njobs=args.njobs, captioning_model_endpoint=args.azure_openai_endpoint, captioning_model_key=args.azure_openai_key)
print("Data preparation for index", index_config["index_name"], "completed")

print(f"Data preparation script completed. {len(config)} indexes updated.")
Loading

0 comments on commit 564195c

Please sign in to comment.