Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Translation #390

Open
wants to merge 161 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
161 commits
Select commit Hold shift + click to select a range
93ab4a6
update UI
blockchainworld Sep 23, 2024
d68ca91
update UI and predefined message
blockchainworld Sep 24, 2024
1a2cdd6
add external search
blockchainworld Sep 25, 2024
1a9716a
fix bug in adding external search
blockchainworld Sep 25, 2024
da17e51
fix bug 2 in adding external search
blockchainworld Sep 25, 2024
4001490
rollback search engine function, update prompt
blockchainworld Sep 25, 2024
4a622c4
Update graph.py
blockchainworld Sep 25, 2024
b403374
Update constants.py
blockchainworld Sep 26, 2024
d7c0bfb
Create stock_utils.py
blockchainworld Sep 26, 2024
9c7e40c
Update graph.py
blockchainworld Sep 26, 2024
e5a25f4
Update stock_utils.py
blockchainworld Sep 26, 2024
ac067b5
Update stock_utils.py
blockchainworld Sep 26, 2024
fd60070
Update ingest.py
xwjisfather Oct 9, 2024
c188b43
Update ingest.py
xwjisfather Oct 9, 2024
6fa79a9
Update stock_utils.py
blockchainworld Oct 9, 2024
7bc3279
Update ingest.py
xwjisfather Oct 9, 2024
0c4ddb2
Update ingest.py
xwjisfather Oct 9, 2024
6b96bd9
Update ingest.py
xwjisfather Oct 9, 2024
ba898ee
Update ingest.py
xwjisfather Oct 9, 2024
91afbf9
Update ingest.py
xwjisfather Oct 9, 2024
5c120fc
Update ingest.py
xwjisfather Oct 9, 2024
e8099d7
Update ingest.py
xwjisfather Oct 9, 2024
912fde4
Update ingest.py
xwjisfather Oct 9, 2024
7e0f058
Update ingest.py
xwjisfather Oct 9, 2024
8b2608f
Update ingest.py
xwjisfather Oct 9, 2024
cfdd683
Update ingest.py
xwjisfather Oct 9, 2024
98d0cd4
Update ingest.py
xwjisfather Oct 9, 2024
0f6c783
Update ingest.py
xwjisfather Oct 9, 2024
ac8b8e6
Update ingest.py
xwjisfather Oct 9, 2024
87c7323
Update ingest.py
xwjisfather Oct 9, 2024
847a6b2
Update ingest.py
xwjisfather Oct 9, 2024
f9bb06a
Update ingest.py
xwjisfather Oct 9, 2024
c00be94
Update ingest.py
xwjisfather Oct 9, 2024
00473d7
Update ingest.py
xwjisfather Oct 10, 2024
1ab7393
Update ingest.py
xwjisfather Oct 10, 2024
6f4f34e
Update ingest.py
xwjisfather Oct 10, 2024
aa42415
Update ingest.py
xwjisfather Oct 11, 2024
4f5aa64
Update ingest.py
xwjisfather Oct 11, 2024
5aa53ed
Update ingest.py
xwjisfather Oct 11, 2024
c7da57e
Update ingest.py
xwjisfather Oct 11, 2024
0baac42
Update ingest.py
xwjisfather Oct 11, 2024
2f22b38
Update ingest.py
xwjisfather Oct 11, 2024
c7c9ea9
Update ingest.py
xwjisfather Oct 11, 2024
86bc45f
Update ChatWindow.tsx to make it responsive and mobile friendly
blockchainworld Oct 12, 2024
e251ea5
Update ChatWindow.tsx for better layout and user experience
blockchainworld Oct 12, 2024
a4764cb
Update ChatWindow.tsx
blockchainworld Oct 12, 2024
90d7a9f
Update ChatWindow.tsx
blockchainworld Oct 12, 2024
94e9837
Add files via upload
blockchainworld Oct 12, 2024
0c05a5f
Update EmptyState.tsx with new recommended questions
blockchainworld Oct 12, 2024
ccdd292
Update ChatWindow.tsx with stock price panel
blockchainworld Oct 13, 2024
ffe2ea7
Update ChatWindow.tsx add with type definition to fix typescript error
blockchainworld Oct 13, 2024
87b019f
Update ChatWindow.tsx
xwjisfather Oct 14, 2024
0cd3e6d
Update ChatWindow.tsx
xwjisfather Oct 14, 2024
c6abf4d
Update ChatWindow.tsx
xwjisfather Oct 14, 2024
de4811e
Update ChatWindow.tsx
xwjisfather Oct 14, 2024
82d8fe3
Update ChatWindow.tsx
xwjisfather Oct 14, 2024
bd8863c
Update ChatWindow.tsx
xwjisfather Oct 14, 2024
2ef42b5
Update ChatWindow.tsx
xwjisfather Oct 14, 2024
e00f60e
Update ChatWindow.tsx
xwjisfather Oct 14, 2024
066d717
Update ChatWindow.tsx
xwjisfather Oct 14, 2024
a09ad66
Update ChatWindow.tsx with defining symbol type in line 142
blockchainworld Oct 14, 2024
a22eee9
Update ChatWindow.tsx with more type definition
blockchainworld Oct 14, 2024
b457e92
Update ChatWindow.tsx with price type
blockchainworld Oct 14, 2024
f7b7c69
Update ChatWindow.tsx
blockchainworld Oct 14, 2024
3ec21c1
Update ChatWindow.tsx
blockchainworld Oct 14, 2024
110d81b
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
b9a7f1a
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
3605c72
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
1b90938
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
44318e5
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
c496846
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
37d3351
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
6fb7760
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
d38106c
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
8ef9ded
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
fc6bffa
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
2aa1132
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
61e8a9c
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
9dcb048
Update route.ts
xwjisfather Oct 15, 2024
c7123cf
Update route.ts
xwjisfather Oct 15, 2024
95dc268
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
348e92a
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
c305c9a
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
2713a16
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
593a14a
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
9d079c8
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
e3af9c8
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
693b036
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
83c06e1
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
0d9f786
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
9066800
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
5d2dac0
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
4b033d2
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
ab21147
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
4875bb1
Update ChatWindow.tsx
xwjisfather Oct 15, 2024
13ac966
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
c731da1
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
df37205
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
e29af73
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
a9e8db5
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
94dff78
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
8d8f4cb
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
40760ed
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
efb4523
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
d8b9ba0
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
2d3181e
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
3aef32c
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
4413450
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
5255a37
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
c813155
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
3955087
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
77b85a7
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
9110dbe
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
df867ce
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
6c19a7f
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
139ac21
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
a5058ff
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
78d1b80
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
f732287
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
25d5f2e
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
de42561
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
e78c325
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
ddb2595
Update ChatWindow.tsx
xwjisfather Oct 17, 2024
224d381
Update ChatWindow.tsx
xwjisfather Oct 18, 2024
ad370c3
Update ChatWindow.tsx
xwjisfather Oct 18, 2024
ed55f4b
Update ChatWindow.tsx
xwjisfather Oct 18, 2024
0dd9b52
Update ChatWindow.tsx
xwjisfather Oct 18, 2024
f6864dd
Update ChatWindow.tsx
xwjisfather Oct 18, 2024
f045c91
Update ChatWindow.tsx
xwjisfather Oct 18, 2024
5e72fb1
Update ChatWindow.tsx
xwjisfather Oct 18, 2024
7993d77
Update ChatWindow.tsx
xwjisfather Oct 18, 2024
dc7deec
Update layout.tsx
blockchainworld Oct 18, 2024
a8507cf
Create AboutUs.tsx
blockchainworld Oct 18, 2024
eefc17c
Update ChatList.tsx
blockchainworld Oct 18, 2024
7c85463
Create PricingPlan.tsx
blockchainworld Oct 18, 2024
1fb19fa
Create RegisterForm.tsx
blockchainworld Oct 18, 2024
4bbc968
Create RichMasterFunds.tsx
blockchainworld Oct 18, 2024
84a6c14
Update ChatWindow.tsx
blockchainworld Oct 18, 2024
87a38bc
Update ChatWindow.tsx
xwjisfather Oct 18, 2024
3d5c496
Update ChatWindow.tsx
blockchainworld Oct 18, 2024
2db4a62
Update EmptyState.tsx
blockchainworld Oct 18, 2024
c67b07b
Update EmptyState.tsx
blockchainworld Oct 18, 2024
2269a1c
Update ChatWindow.tsx
xwjisfather Oct 18, 2024
e7b6d98
Update ChatList.tsx
blockchainworld Oct 19, 2024
8547e17
Update ChatWindow.tsx
blockchainworld Oct 19, 2024
a8e1a66
Update ChatWindow.tsx
blockchainworld Oct 19, 2024
57be6cf
Update ChatWindow.tsx
blockchainworld Oct 19, 2024
4e43958
Update ChatWindow.tsx
blockchainworld Oct 19, 2024
f18b373
Update ChatWindow.tsx
blockchainworld Oct 19, 2024
311d2c8
Update ChatWindow.tsx
blockchainworld Oct 19, 2024
4b312d3
Update page.tsx
blockchainworld Oct 21, 2024
b86b7e4
Update page.tsx
blockchainworld Oct 21, 2024
2533d98
Update graph.py
blockchainworld Oct 21, 2024
14c00af
Update graph.py
blockchainworld Oct 21, 2024
c649470
Translation
xwjisfather Oct 22, 2024
e7136fa
Update package.json
xwjisfather Oct 22, 2024
2afe11e
Update layout.tsx
xwjisfather Oct 22, 2024
d9b8ab4
Update ChatList.tsx
xwjisfather Oct 22, 2024
67ace8d
Update ChatWindow.tsx
xwjisfather Oct 22, 2024
d024f35
Update ChatList.tsx
xwjisfather Oct 22, 2024
fa63956
Update ChatList.tsx
xwjisfather Oct 22, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .gitpod.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# This configuration file was automatically generated by Gitpod.
# Please adjust to your needs (see https://www.gitpod.io/docs/introduction/learn-gitpod/gitpod-yaml)
# and commit this file to your remote git repository to share the goodness with others.

# Learn more from ready-to-use templates: https://www.gitpod.io/docs/introduction/getting-started/quickstart

tasks:
- init: make


1 change: 1 addition & 0 deletions backend/constants.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
# Index name used for the combined docs collection (presumably the Weaviate
# index the retriever reads from -- confirm against the retriever setup).
WEAVIATE_DOCS_INDEX_NAME = "LangChain_Combined_Docs_OpenAI_text_embedding_3_small"

# Base URL of the hosted stock-quote service (trailing slash included).
STOCK_API_URL = "https://yfinance-fza5dthrg6dxd2c3.southeastasia-01.azurewebsites.net/"
100 changes: 74 additions & 26 deletions backend/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,38 +32,31 @@

from backend.constants import WEAVIATE_DOCS_INDEX_NAME
from backend.ingest import get_embeddings_model
from backend.stock_utils import extract_and_fetch_stock_data, format_stock_info

RESPONSE_TEMPLATE = """\
You are an expert programmer and problem-solver, tasked with answering any question \
about Langchain.

Generate a comprehensive and informative answer of 80 words or less for the \
given question based solely on the provided search results (URL and content). You must \
only use information from the provided search results. Use an unbiased and \
journalistic tone. Combine search results together into a coherent answer. Do not \
repeat text. Cite search results using [${{number}}] notation. Only cite the most \
relevant results that answer the question accurately. Place these citations at the end \
of the sentence or paragraph that reference them - do not put them all at the end. If \
different results refer to different entities within the same name, write separate \
answers for each entity.
RESPONSE_TEMPLATE = """
You are an expert in stocks, finance, and cryptocurrencies, tasked with answering any question related to these domains. You can communicate fluently in both English and Chinese.

You should use bullet points in your answer for readability. Put citations where they apply
rather than putting them all at the end.
Generate a comprehensive and informative answer of 500 words or less for the given question based solely on the provided search results (URL and content). You must only use information from the provided search results. Use an unbiased and journalistic tone. Combine search results together into a coherent answer. Do not repeat text. Cite search results using [${{number}}] notation. Only cite the most relevant results that answer the question accurately. Place these citations at the end of the sentence or paragraph that reference them - do not put them all at the end. If different results refer to different entities within the same name, write separate answers for each entity.

If there is nothing in the context relevant to the question at hand, just say "Hmm, \
I'm not sure." Don't try to make up an answer.
You should use bullet points in your answer for readability. Put citations where they apply rather than putting them all at the end.

If there is nothing in the context relevant to the question at hand, try your best to create an answer based on your own knowledge as a stock, finance, and crypto expert, but add a note at the end stating: "Note: This response is based on the AI's own knowledge as a stock, finance, and crypto expert, as no relevant information was found in the provided context."

Anything between the following `context` html blocks is retrieved from a knowledge \
bank, not part of the conversation with the user.
If the input question is in Chinese, respond in Chinese. If the input question is in English, respond in English.

Anything between the following context html blocks is retrieved from a knowledge bank, not part of the conversation with the user.

<context>
{context}

{context}

<context/>

REMEMBER: If there is no relevant information within the context, just say "Hmm, I'm \
not sure." Don't try to make up an answer. Anything between the preceding 'context' \
html blocks is retrieved from a knowledge bank, not part of the conversation with the \
user.\
REMEMBER: If there is no relevant information within the context, create an answer based on your own knowledge as a stock, finance, and crypto expert and include the note about the source of the information. Anything between the preceding 'context' html blocks is retrieved from a knowledge bank, not part of the conversation with the user.

如果输入的问题是中文,请用中文回答。如果输入的问题是英文,请用英文回答。
"""

COHERE_RESPONSE_TEMPLATE = """\
Expand Down Expand Up @@ -137,6 +130,7 @@ class AgentState(TypedDict):
# for convenience in evaluations
answer: str
feedback_urls: dict[str, list[str]]
stock_data: Optional[list[dict]] # New field


gpt_4o_mini = ChatOpenAI(model="gpt-4o-mini-2024-07-18", temperature=0, streaming=True)
Expand Down Expand Up @@ -204,6 +198,17 @@ class AgentState(TypedDict):
)


# Real-time stock info section
def check_stock_symbols(state: AgentState) -> AgentState:
    """Attach stock data for the latest message to the graph state.

    The content of the last message in ``state["messages"]`` is passed to
    ``extract_and_fetch_stock_data``. A truthy result is stored under
    ``state["stock_data"]``; otherwise the state is left untouched.

    Args:
        state: Current graph state; must contain at least one message.

    Returns:
        The same ``state`` object, updated in place when stock data was found.
    """
    latest_message = convert_to_messages(state["messages"])[-1]
    fetched = extract_and_fetch_stock_data(latest_message.content)
    if not fetched:
        # Nothing usable came back: pass the state through unchanged.
        return state
    state["stock_data"] = fetched
    return state



@contextlib.contextmanager
def get_retriever(k: Optional[int] = None) -> Iterator[BaseRetriever]:
with weaviate.connect_to_weaviate_cloud(
Expand Down Expand Up @@ -305,7 +310,8 @@ def get_feedback_urls(config: RunnableConfig) -> dict[str, list[str]]:
return key_to_token_urls


def synthesize_response(

def synthesize_response_old(
state: AgentState,
config: RunnableConfig,
model: LanguageModelLike,
Expand Down Expand Up @@ -338,6 +344,44 @@ def synthesize_response(
"feedback_urls": feedback_urls,
}

def synthesize_response(
    state: AgentState,
    config: RunnableConfig,
    model: LanguageModelLike,
    prompt_template: str,
) -> AgentState:
    """Generate the answer message from retrieved documents and stock data.

    Builds a chat prompt (system template, prior chat history, human
    question), pipes it into ``model`` and invokes the chain once.

    Args:
        state: Graph state providing ``documents``, ``query``, ``messages``
            and, optionally, ``stock_data``.
        config: Runnable configuration, forwarded to ``get_feedback_urls``.
        model: Language model that receives the rendered prompt.
        prompt_template: System prompt text; it is rendered with the
            ``context`` variable supplied below.

    Returns:
        A state update holding the new message, its text content as
        ``answer``, and the feedback URLs.
    """
    chain = (
        ChatPromptTemplate.from_messages(
            [
                ("system", prompt_template),
                ("placeholder", "{chat_history}"),
                ("human", "{question}"),
            ]
        )
        | model
    )

    # Retrieved documents form the base context; formatted stock data, when
    # present and non-empty, is placed in front of it.
    context = format_docs(state["documents"])
    stock_rows = state.get("stock_data")
    if stock_rows:
        context = format_stock_info(stock_rows) + "\n" + context

    payload = {
        "question": state["query"],
        "context": context,
        # The final message is excluded from the history passed to the prompt.
        "chat_history": get_chat_history(
            convert_to_messages(state["messages"][:-1])
        ),
    }
    reply = chain.invoke(payload)

    feedback_urls = get_feedback_urls(config)
    return {
        "messages": [reply],
        "answer": reply.content,
        "feedback_urls": feedback_urls,
    }



def synthesize_response_default(
state: AgentState, config: RunnableConfig
Expand Down Expand Up @@ -372,13 +416,17 @@ class InputSchema(TypedDict):
workflow = StateGraph(AgentState, Configuration, input=InputSchema)

# define nodes
workflow.add_node("stock_symbol_check", check_stock_symbols)
workflow.add_node("retriever", retrieve_documents)
workflow.add_node("retriever_with_chat_history", retrieve_documents_with_chat_history)
workflow.add_node("response_synthesizer", synthesize_response_default)
workflow.add_node("response_synthesizer_cohere", synthesize_response_cohere)

# set entry point to retrievers
workflow.set_conditional_entry_point(route_to_retriever)
# set entry point to stock symbol check
workflow.set_entry_point("stock_symbol_check")

# connect stock symbol check to retrievers
workflow.add_conditional_edges("stock_symbol_check", route_to_retriever)

# connect retrievers and response synthesizers
workflow.add_conditional_edges("retriever", route_to_response_synthesizer)
Expand Down
126 changes: 126 additions & 0 deletions backend/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
import logging
import os
import re
import requests
from typing import Optional
from lxml import etree

import weaviate
from bs4 import BeautifulSoup, SoupStrainer
Expand Down Expand Up @@ -122,7 +124,128 @@ def load_api_docs():
),
).load()

# def extract_items_from_rss(feed_url: str):
# response = requests.get(feed_url)
# soup = BeautifulSoup(response.content, "xml")
# items = soup.find_all("item")
# news_items = []

# for item in items:
# title = item.find("title").get_text() if item.find("title") else "No Title"
# link = item.find("link").get_text() if item.find("link") else "No Link"
# identifier = item.find("dc:identifier").get_text() if item.find("dc:identifier") else "No Identifier"
# pub_date = item.find("pubDate").get_text() if item.find("pubDate") else "No Date"
# creator = item.find("dc:creator").get_text() if item.find("dc:creator") else "No Creator"
# thumbnail = item.find("media:thumbnail")['url'] if item.find("media:thumbnail") else "No Thumbnail"
# guid = item.find("guid").get_text() if item.find("guid") else "No GUID"
# description = item.find("description").get_text() if item.find("description") else "No Description"
# content_encoded = item.find("content:encoded").decode_contents() if item.find("content:encoded") else "No Content"

# news_item = {
# "title": title,
# "link": link,
# "identifier": identifier,
# "pub_date": pub_date,
# "creator": creator,
# "thumbnail": thumbnail,
# "guid": guid,
# "description": description,
# "content": content_encoded,
# }
# news_items.append(news_item)


# return news_items


# def load_sample_news():
# feed_url = "https://cdn.feedcontrol.net/7512/12213-hIFHBiLc7Wh50.xml"
# items = extract_items_from_rss(feed_url)
# documents = []
# for item in items:
# doc = Document(
# page_content=item['content'],
# metadata={"title": item["title"], "source": item["link"], "identifier": item["identifier"], "pub_date": item["pub_date"], "creator": item[creator],
# "thumbnail": item[thumbnail], "guid": item[guid], "description": item[description]}
# )
# documents.append(doc)

# return documents


# def load_sample_news():
# return SitemapLoader(
# "https://cdn.feedcontrol.net/7512/12213-hIFHBiLc7Wh50.xml",
# filter_urls=[],
# parsing_function=simple_extractor,
# default_parser="lxml",
# bs_kwargs={"parse_only": SoupStrainer(name=("article", "title", "html", "lang", "content"))},
# meta_function=metadata_extractor,
# ).load()


def generate_sitemap_xml(xml_string) -> str:
    """Convert an RSS feed document into sitemap XML.

    Every ``<item>`` of the feed that has a non-empty ``<link>`` becomes one
    ``<url>`` entry: ``<loc>`` holds the link text and, when the item has a
    non-empty ``<pubDate>``, ``<lastmod>`` holds that text unchanged.

    Items without a link are skipped, and ``<lastmod>`` is omitted when no
    publication date is present, so placeholder strings never end up in the
    sitemap where a URL or a date is expected.

    Args:
        xml_string: RSS feed markup (``str`` or ``bytes``) to parse.

    Returns:
        The sitemap as a pretty-printed string with an XML declaration.
    """
    soup = BeautifulSoup(xml_string, "xml")
    urlset = etree.Element(
        "urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
    )

    for item in soup.find_all("item"):
        link_tag = item.find("link")
        link = link_tag.get_text().strip() if link_tag is not None else ""
        if not link:
            # Without a link there is nothing to put in <loc>.
            continue

        url = etree.SubElement(urlset, "url")
        etree.SubElement(url, "loc").text = link

        date_tag = item.find("pubDate")
        pub_date = date_tag.get_text().strip() if date_tag is not None else ""
        if pub_date:
            etree.SubElement(url, "lastmod").text = pub_date

    serialized = etree.tostring(
        urlset, pretty_print=True, xml_declaration=True, encoding="UTF-8"
    )
    return serialized.decode("utf-8")

def load_sample_news():
    """Download the sample news RSS feed and load its articles.

    The feed is converted to a sitemap with ``generate_sitemap_xml``, written
    to ``sample_news.xml`` in the current working directory, and that local
    file is then handed to ``SitemapLoader``.

    Returns:
        Whatever ``SitemapLoader.load()`` returns for the generated sitemap.

    Raises:
        requests.HTTPError: If the feed request returns an error status.
        requests.Timeout: If the feed server does not answer in time.
    """
    sitemap_path = "sample_news.xml"

    # Bound the request and fail loudly on HTTP errors; otherwise an error
    # page would be parsed as a feed and yield an empty sitemap.
    response = requests.get(
        "https://cdn.feedcontrol.net/7512/12213-hIFHBiLc7Wh50.xml",
        timeout=30,
    )
    response.raise_for_status()
    sitemap_xml = generate_sitemap_xml(response.content)

    with open(sitemap_path, "w", encoding="utf-8") as file:
        file.write(sitemap_xml)
    # Report the file that was actually written.
    print(f"Sitemap XML has been saved to '{sitemap_path}'.")

    return SitemapLoader(
        sitemap_path,
        is_local=True,
        parsing_function=simple_extractor,
        default_parser="lxml",
        bs_kwargs={
            "parse_only": SoupStrainer(
                name=("article", "title", "html", "lang", "content")
            )
        },
        meta_function=lambda meta, soup: metadata_extractor(
            meta, soup, title_suffix=" | sample_news"
        ),
    ).load()

def ingest_docs():
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
WEAVIATE_API_KEY = os.environ["WEAVIATE_API_KEY"]
Expand Down Expand Up @@ -157,12 +280,15 @@ def ingest_docs():
logger.info(f"Loaded {len(docs_from_langsmith)} docs from LangSmith")
docs_from_langgraph = load_langgraph_docs()
logger.info(f"Loaded {len(docs_from_langgraph)} docs from LangGraph")
docs_from_sample_news = load_sample_news()
logger.info(f"Loaded {len(docs_from_sample_news)} docs from SampleNews")

docs_transformed = text_splitter.split_documents(
docs_from_documentation
+ docs_from_api
+ docs_from_langsmith
+ docs_from_langgraph
+ docs_from_sample_news
)
docs_transformed = [
doc for doc in docs_transformed if len(doc.page_content) > 10
Expand Down
Loading