-
Notifications
You must be signed in to change notification settings - Fork 125
/
Copy pathes_retrieve.py
58 lines (47 loc) · 1.75 KB
/
es_retrieve.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from elasticsearch import Elasticsearch
import random
#### Connect from the host machine through the port that Docker exposes.
ES_HOSTS = ["http://localhost:9200"]
es = Elasticsearch(ES_HOSTS)

# Print the cluster health report; this also confirms the node is reachable.
cluster_health = es.cluster.health()
print(cluster_health)
#### Index names only
# The wildcard asks for every index; iterating the response yields the names.
indices = es.indices.get(index='*')
for name in indices:
    print(name)

#### Index names with their document counts
# The cat API with format='json' returns one record per index.
indices_info = es.cat.indices(format='json')
for record in indices_info:
    print(f"{record['index']} doc count: {record['docs.count']}")
#### Print fields in index.
# Indexes from this example are `image_embeddings` and `text_embeddings`.
INDEX_NAME = "text_embeddings"
mapping = es.indices.get_mapping(index=INDEX_NAME)

# Use the first key of the response instead of INDEX_NAME itself.
# NOTE(review): presumably the response is keyed by the concrete index name,
# which can differ from INDEX_NAME for an alias or pattern -- confirm.
index_name = next(iter(mapping.body))

# An index without any mapped field has no 'properties' entry; fall back to an
# empty mapping so the listing is simply empty instead of raising KeyError.
fields = mapping.body[index_name]['mappings'].get('properties', {}).keys()

print("Fields in index:")
for field in fields:
    print(field)
#### Sanity test.
# Build a random vector to exercise the retrieval pipeline. In production the
# query embedding must come from the same model the workflow uses.
# The workflow's text embeddings have 768 dimensions.
EMBEDDING_DIM = 768

# Fixed seed so every run issues the same query. For different results pick
# another seed (e.g. 42) or another sampling range (e.g. 0.5 to 0.6).
random.seed(4)
RANGE_LOW, RANGE_HIGH = 0.8, 1.
query_vec = [random.uniform(RANGE_LOW, RANGE_HIGH) for _ in range(EMBEDDING_DIM)]
#### Query the documents with kNN.
# Tune "k" and "num_candidates" as needed, keeping k <= num_candidates.
QUERY_FIELD_TEXT_INDEX = "embedding"
knn_clause = {
    "field": QUERY_FIELD_TEXT_INDEX,
    "query_vector": query_vec,
    "k": 2,
    "num_candidates": 3,
}
response = es.search(index=INDEX_NAME, body={"knn": knn_clause})
#### Show each hit returned by the kNN search.
print("\n KNN Documents:")
for doc in response['hits']['hits']:
    # Header line with the document id and its score, then the stored
    # 'chunk' field from the document source.
    doc_id, score = doc['_id'], doc['_score']
    print(f"Document ID: {doc_id}, Score: {score}")
    print(doc['_source']['chunk'])