-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest.py
More file actions
80 lines (67 loc) · 2.91 KB
/
test.py
File metadata and controls
80 lines (67 loc) · 2.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
import requests
from bs4 import BeautifulSoup
from langchain_upstage import UpstageEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain_upstage import ChatUpstage
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from concurrent.futures import ThreadPoolExecutor
import re
from datetime import datetime
# Upstage API 키 설정
os.environ["UPSTAGE_API_KEY"] = "up_coecXafSJVG1v17EEZ3lxjFbZ8xcD"
# 모델 초기화
embedding_model = UpstageEmbeddings(model="solar-embedding-1-large")
upstage_llm = ChatUpstage(api_key=os.getenv("UPSTAGE_API_KEY"))
# URL에서 텍스트를 추출하는 함수
def extract_text_from_urls(urls):
def fetch_text(url):
try:
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
return "\n".join([para.get_text() for para in soup.find_all('p')])
except Exception as e:
print(f"Error processing {url}: {e}")
return ""
with ThreadPoolExecutor() as executor:
return list(executor.map(fetch_text, urls))
# 최신 wr_id를 얻는 함수
def get_latest_wr_id():
url = "https://cse.knu.ac.kr/bbs/board.php?bo_table=sub5_1"
response = requests.get(url)
if response.status_code == 200:
match = re.search(r'wr_id=(\d+)', response.text)
if match:
return int(match.group(1))
return None
# 스크래핑할 URL 목록 생성
now_number = get_latest_wr_id()
urls = [f"https://cse.knu.ac.kr/bbs/board.php?bo_table=sub5_1&wr_id={num}" for num in range(now_number, now_number-30, -1)]
# URL에서 문서 추출
document_texts = extract_text_from_urls(urls)
# 텍스트 분리기
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = [chunk for doc in document_texts for chunk in text_splitter.split_text(doc) if doc]
# FAISS 벡터스토어 초기화
vectorstore = FAISS.from_texts(texts, embedding_model)
# ConversationBufferMemory 초기화
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
# ConversationalRetrievalChain 생성
qa = ConversationalRetrievalChain.from_llm(
llm=upstage_llm,
retriever=vectorstore.as_retriever(),
memory=memory
)
def get_ai_message(user_question):
try:
# 현재 시간 가져오기
current_time = datetime.now().strftime("%Y-%m-%d")
# 사용자 질문에 현재 시간 포함
user_question_with_time = f"{user_question} (현재 시간: {current_time})"
# 대화 기록을 포함하여 질문에 답변
result = qa({"question": user_question_with_time})
return result['answer']
except Exception as e:
return f"답변을 생성하는 중 오류가 발생했습니다: {e}"