-
-
Notifications
You must be signed in to change notification settings - Fork 615
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 5c8ba12
Showing
9 changed files
with
1,237 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Runs the arXiv recommender once a day and e-mails the result.
name: Send emails daily
on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
  schedule:
    # GitHub evaluates cron expressions in UTC: every day at 06:00 UTC.
    - cron: '0 6 * * *'

jobs:
  calculate-and-send:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        # v4 runs on a supported Node runtime; v3 relies on the deprecated Node 16.
        uses: actions/checkout@v4

      - name: Setup uv
        uses: astral-sh/setup-uv@v3
        with:
          version: '0.5.4'

      - name: Run script
        # main.py reads each of these settings from the environment.
        env:
          ZOTERO_ID: ${{ secrets.ZOTERO_ID }}
          ZOTERO_KEY: ${{ secrets.ZOTERO_KEY }}
          ARXIV_QUERY: ${{ secrets.ARXIV_QUERY }}
          SMTP_SERVER: ${{ secrets.SMTP_SERVER }}
          SMTP_PORT: ${{ secrets.SMTP_PORT }}
          SENDER: ${{ secrets.SENDER }}
          RECEIVER: ${{ secrets.RECEIVER }}
          SENDER_PASSWORD: ${{ secrets.SENDER_PASSWORD }}
        run: |
          uv run main.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Python-generated files | ||
__pycache__/ | ||
*.py[oc] | ||
build/ | ||
dist/ | ||
wheels/ | ||
*.egg-info | ||
|
||
# Virtual environments | ||
.venv | ||
|
||
test.ipynb |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
3.11 |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
import arxiv | ||
import math | ||
# HTML skeleton of the e-mail. render_email() replaces the __CONTENT__
# placeholder with the rendered paper blocks; the CSS classes declared here
# (star-wrapper, half-star, full-star) are the ones used by get_stars().
framework = """
<!DOCTYPE HTML>
<html>
<head>
<style>
.star-wrapper {
font-size: 2em; /* 调整星星大小 */
line-height: 1; /* 确保垂直对齐 */
display: inline-flex;
align-items: center; /* 保持对齐 */
}
.half-star {
display: inline-block;
width: 0.5em; /* 半颗星的宽度 */
overflow: hidden;
white-space: nowrap;
vertical-align: middle;
}
.full-star {
vertical-align: middle;
}
</style>
</head>
<body>
<div>
__CONTENT__
</div>
<br><br>
<div>
To unsubscribe, remove your email in your Github Action setting.
</div>
</body>
</html>
"""
|
||
|
||
|
||
def get_block_html(title:str, authors:str, rate:str,arxiv_id:str, abstract:str, pdf_url:str, code_url:str=None): | ||
code = f'<a href="{code_url}" style="display: inline-block; text-decoration: none; font-size: 14px; font-weight: bold; color: #fff; background-color: #5bc0de; padding: 8px 16px; border-radius: 4px; margin-left: 8px;">Code</a>' if code_url else '' | ||
block_template = """ | ||
<table border="0" cellpadding="0" cellspacing="0" width="100%" style="font-family: Arial, sans-serif; border: 1px solid #ddd; border-radius: 8px; padding: 16px; background-color: #f9f9f9;"> | ||
<tr> | ||
<td style="font-size: 20px; font-weight: bold; color: #333;"> | ||
{title} | ||
</td> | ||
</tr> | ||
<tr> | ||
<td style="font-size: 14px; color: #666; padding: 8px 0;"> | ||
{authors} | ||
</td> | ||
</tr> | ||
<tr> | ||
<td style="font-size: 14px; color: #333; padding: 8px 0;"> | ||
<strong>Relevance:</strong> {rate} | ||
</td> | ||
</tr> | ||
<tr> | ||
<td style="font-size: 14px; color: #333; padding: 8px 0;"> | ||
<strong>arXiv ID:</strong> {arxiv_id} | ||
</td> | ||
</tr> | ||
<tr> | ||
<td style="font-size: 14px; color: #333; padding: 8px 0;"> | ||
<strong>Abstract:</strong> {abstract} | ||
</td> | ||
</tr> | ||
<tr> | ||
<td style="padding: 8px 0;"> | ||
<a href="{pdf_url}" style="display: inline-block; text-decoration: none; font-size: 14px; font-weight: bold; color: #fff; background-color: #d9534f; padding: 8px 16px; border-radius: 4px;">PDF</a> | ||
{code} | ||
</td> | ||
</tr> | ||
</table> | ||
""" | ||
return block_template.format(title=title, authors=authors,rate=rate,arxiv_id=arxiv_id, abstract=abstract, pdf_url=pdf_url, code=code) | ||
|
||
def get_stars(score: float, *, low: float = 6, high: float = 8) -> str:
    """Convert a relevance score into HTML star markup.

    Scores at or below ``low`` yield no stars, scores at or above ``high``
    yield five full stars, and scores in between are mapped onto half-star
    steps (rounded up). Every non-empty rating is wrapped in the same
    ``star-wrapper`` div so that all ratings render at the same size.

    Args:
        score: Relevance score of the paper.
        low: Score at or below which no stars are shown.
        high: Score at or above which five full stars are shown.

    Returns:
        An HTML fragment, or an empty string when no stars are earned.
    """
    full_star = '<span class="full-star">⭐</span>'
    half_star = '<span class="half-star">⭐</span>'
    if score <= low:
        return ''
    if score >= high:
        stars = full_star * 5
    else:
        # Ten half-star steps span the (low, high) range.
        interval = (high - low) / 10
        # Clamp so float round-off can never produce more than five stars.
        half_steps = min(math.ceil((score - low) / interval), 10)
        full_star_num, half_star_num = divmod(half_steps, 2)
        stars = full_star * full_star_num + half_star * half_star_num
    return '<div class="star-wrapper">' + stars + '</div>'
|
||
|
||
def render_email(papers: list[arxiv.Result]) -> str:
    """Build the complete HTML e-mail for a list of papers.

    For each paper this reads ``summary``, ``score``, ``authors``, ``title``,
    ``arxiv_id``, ``pdf_url`` and ``code_url``. The abstract is cropped to
    600 characters and at most five authors are listed.

    Args:
        papers: Papers to render, in display order.

    Returns:
        The ``framework`` template with ``__CONTENT__`` replaced by the
        rendered paper blocks.
    """
    max_summary_len = 600
    max_authors = 5
    parts = []
    for p in papers:
        # Crop the abstract; add an ellipsis only when text was actually cut
        # (an abstract of exactly 600 characters is left untouched).
        summary = p.summary
        if len(summary) > max_summary_len:
            summary = summary[:max_summary_len] + '...'
        rate = get_stars(p.score)
        authors = ', '.join(a.name for a in p.authors[:max_authors])
        if len(p.authors) > max_authors:
            authors += ', ...'
        parts.append(get_block_html(p.title, authors, rate, p.arxiv_id, summary, p.pdf_url, p.code_url))

    # NOTE(review): '</br>' is not a valid closing tag; the separators are kept
    # as-is so the spacing between blocks does not change.
    content = '<br>' + '</br><br>'.join(parts) + '</br>'
    return framework.replace('__CONTENT__', content)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
import arxiv | ||
import argparse | ||
import os | ||
from pyzotero import zotero | ||
from recommender import rerank_paper | ||
from construct_email import render_email | ||
import requests | ||
import datetime | ||
import re | ||
from time import sleep | ||
from email.header import Header | ||
from email.mime.text import MIMEText | ||
from email.utils import parseaddr, formataddr | ||
import smtplib | ||
|
||
def get_zotero_corpus(id:str,key:str) -> list[dict]:
    """Fetch the papers of a Zotero user library that have an abstract.

    Requests all conference papers, journal articles and preprints of the
    user library identified by ``id`` (authenticated with ``key``) and drops
    every item whose ``abstractNote`` is an empty string.
    """
    library = zotero.Zotero(id, 'user', key)
    items = library.everything(
        library.items(itemType='conferencePaper || journalArticle || preprint')
    )
    with_abstract = []
    for item in items:
        if item['data']['abstractNote'] != '':
            with_abstract.append(item)
    return with_abstract
|
||
def _fetch_json(url: str, retries: int = 5, delay: float = 1, timeout: float = 10):
    """GET ``url`` and decode the JSON body; return None if every attempt fails."""
    for attempt in range(retries):
        try:
            # A timeout keeps an unresponsive server from stalling the run forever.
            return requests.get(url, timeout=timeout).json()
        except (requests.RequestException, ValueError):
            # Network failure or undecodable body: wait, then retry
            # (no point sleeping after the final attempt).
            if attempt < retries - 1:
                sleep(delay)
    return None


def _first_result(payload):
    """Return the first entry of a ``{'count': ..., 'results': [...]}`` payload, or None."""
    if not isinstance(payload, dict) or payload.get('count', 0) == 0:
        return None
    results = payload.get('results') or []
    return results[0] if results else None


def get_paper_code_url(paper: arxiv.Result) -> str | None:
    """Look up a code repository URL for ``paper`` on paperswithcode.com.

    Queries the papers endpoint by ``paper.arxiv_id`` and then the
    repositories endpoint of the first matching paper. Each request is tried
    up to five times.

    Args:
        paper: Paper carrying an ``arxiv_id`` attribute.

    Returns:
        The URL of the first listed repository, or None when the paper is
        unknown, has no repositories, or the service could not be reached.
    """
    paper_entry = _first_result(
        _fetch_json(f'https://paperswithcode.com/api/v1/papers/?arxiv_id={paper.arxiv_id}')
    )
    if paper_entry is None:
        return None
    paper_id = paper_entry['id']

    repo_entry = _first_result(
        _fetch_json(f'https://paperswithcode.com/api/v1/papers/{paper_id}/repositories/')
    )
    if repo_entry is None:
        return None
    return repo_entry['url']
|
||
def get_arxiv_paper(query:str, start:datetime.datetime, end:datetime.datetime) -> list[arxiv.Result]:
    """Collect the arXiv results for ``query`` published in ``[start, end)``.

    Results are requested sorted by submission date and scanning stops at the
    first result published before ``start``. Each kept result is annotated
    with ``arxiv_id`` (short id without the version suffix) and ``code_url``.
    """
    # NOTE(review): the early stop assumes newest-first ordering - confirm the
    # library's default sort order.
    client = arxiv.Client()
    search = arxiv.Search(query=query, sort_by=arxiv.SortCriterion.SubmittedDate)
    selected = []
    for result in client.results(search):
        published = result.published
        if published >= start and published < end:
            result.arxiv_id = re.sub(r'v\d+$', '', result.get_short_id())
            result.code_url = get_paper_code_url(result)
            selected.append(result)
            continue
        if published < start:
            # Everything after this point is older than the window.
            break
    return selected
|
||
def send_email(sender:str, receiver:str, password:str,smtp_server:str,smtp_port:int, html:str,):
    """Send ``html`` as an HTML e-mail over SMTP with STARTTLS.

    Args:
        sender: Sender address; also used as the SMTP login name.
        receiver: Recipient address.
        password: Password for the SMTP login.
        smtp_server: Host name of the SMTP server.
        smtp_port: Port of the SMTP server.
        html: HTML body of the message.

    Raises:
        smtplib.SMTPException: If the TLS upgrade, login or delivery fails.
        OSError: If the connection cannot be established.
    """
    def _format_addr(s):
        # Encode the display name so non-ASCII names survive in the header.
        name, addr = parseaddr(s)
        return formataddr((Header(name, 'utf-8').encode(), addr))

    msg = MIMEText(html, 'html', 'utf-8')
    msg['From'] = _format_addr('Github Action <%s>' % sender)
    msg['To'] = _format_addr('You <%s>' % receiver)
    today = datetime.datetime.now().strftime('%Y/%m/%d')
    msg['Subject'] = Header(f'Daily arXiv {today}', 'utf-8').encode()

    # The context manager issues QUIT and closes the socket even when
    # starttls/login/sendmail raise, so the connection is never leaked.
    with smtplib.SMTP(smtp_server, smtp_port) as server:
        server.starttls()
        server.login(sender, password)
        server.sendmail(sender, [receiver], msg.as_string())
|
||
if __name__ == '__main__':
    # Every option defaults to the matching environment variable, so the script
    # can be configured through command-line flags or the environment.
    parser = argparse.ArgumentParser(description='Recommender system for academic papers')
    parser.add_argument('--zotero_id', type=str, help='Zotero user ID', default=os.environ.get('ZOTERO_ID'))
    parser.add_argument('--zotero_key', type=str, help='Zotero API key', default=os.environ.get('ZOTERO_KEY'))
    parser.add_argument('--arxiv_query', type=str, help='Arxiv search query', default=os.environ.get('ARXIV_QUERY'))
    parser.add_argument('--smtp_server', type=str, help='SMTP server', default=os.environ.get('SMTP_SERVER'))
    parser.add_argument('--smtp_port', type=int, help='SMTP port', default=os.environ.get('SMTP_PORT'))
    parser.add_argument('--sender', type=str, help='Sender email address', default=os.environ.get('SENDER'))
    parser.add_argument('--receiver', type=str, help='Receiver email address', default=os.environ.get('RECEIVER'))
    parser.add_argument('--password', type=str, help='Sender email password', default=os.environ.get('SENDER_PASSWORD'))
    args = parser.parse_args()

    # Validate explicitly instead of with assert (asserts are stripped under -O),
    # and check the SMTP settings up front so a misconfiguration is reported
    # before the slow retrieval and reranking steps rather than after them.
    required = ('zotero_id', 'zotero_key', 'arxiv_query', 'smtp_server',
                'smtp_port', 'sender', 'receiver', 'password')
    missing = [name for name in required if getattr(args, name) is None]
    if missing:
        parser.error('missing required settings: ' + ', '.join('--' + name for name in missing))

    # Search window: the two days preceding today's midnight (UTC).
    today = datetime.datetime.now(tz=datetime.timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
    window_start = today - datetime.timedelta(days=2)
    print("Retrieving Zotero corpus...")
    corpus = get_zotero_corpus(args.zotero_id, args.zotero_key)
    print("Retrieving Arxiv papers...")
    papers = get_arxiv_paper(args.arxiv_query, window_start, today)
    if not papers:
        print("No new papers found.")
        raise SystemExit(0)
    print("Reranking papers...")
    papers = rerank_paper(papers, corpus)
    html = render_email(papers)
    print("Sending email...")
    send_email(args.sender, args.receiver, args.password, args.smtp_server, args.smtp_port, html)
    # The HTML contains non-ASCII characters (star emoji), so the encoding must
    # not depend on the platform default.
    with open('email.html', 'w', encoding='utf-8') as f:
        f.write(html)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
[project]
name = "zotero-arxiv-daily"
version = "0.1.0"
description = "Recommend new arXiv papers daily based on your Zotero library and send them by email"
readme = "README.md"
requires-python = ">=3.11"
# numpy and requests are imported directly by the code, so they are declared
# here instead of relying on them being pulled in transitively.
dependencies = [
    "arxiv>=2.1.3",
    "numpy",
    "pyzotero>=1.5.25",
    "requests",
    "scikit-learn>=1.5.2",
    "sentence-transformers>=3.3.1",
    "typing-extensions>=4.12.2",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import numpy as np | ||
from sentence_transformers import SentenceTransformer | ||
import arxiv | ||
from datetime import datetime | ||
|
||
def rerank_paper(candidate: list[arxiv.Result], corpus: list[dict], model: str = 'avsolatorio/GIST-small-Embedding-v0') -> list[arxiv.Result]:
    """Score candidate papers against a Zotero corpus and sort them by score.

    Each candidate abstract is compared with every corpus abstract using the
    sentence-embedding ``model``. Similarities are combined with weights that
    decrease with the corpus item's rank by ``dateAdded`` (newest first), and
    the weighted sum, scaled by 10, is stored on the candidate as ``score``.

    Args:
        candidate: Papers to rank; each must provide ``summary``.
        corpus: Zotero items providing ``data.abstractNote`` and
            ``data.dateAdded`` (``%Y-%m-%dT%H:%M:%SZ``).
        model: Name of the sentence-transformers model to load.

    Returns:
        The candidates as a new list, sorted by descending ``score``.
    """
    if not candidate:
        # Nothing to rank: skip loading the embedding model altogether.
        return []
    encoder = SentenceTransformer(model)
    # Sort corpus by date, from newest to oldest, so index 0 gets the largest weight.
    corpus = sorted(corpus, key=lambda x: datetime.strptime(x['data']['dateAdded'], '%Y-%m-%dT%H:%M:%SZ'), reverse=True)
    # Logarithmic decay over the rank, normalised to sum to 1.
    time_decay_weight = 1 / (1 + np.log10(np.arange(len(corpus)) + 1))
    time_decay_weight = time_decay_weight / time_decay_weight.sum()
    corpus_feature = encoder.encode([paper['data']['abstractNote'] for paper in corpus])
    candidate_feature = encoder.encode([paper.summary for paper in candidate])
    sim = encoder.similarity(candidate_feature, corpus_feature)  # [n_candidate, n_corpus]
    scores = (sim * time_decay_weight).sum(axis=1) * 10  # [n_candidate]
    for s, c in zip(scores, candidate):
        c.score = s.item()
    return sorted(candidate, key=lambda x: x.score, reverse=True)
Oops, something went wrong.