Skip to content

Commit

Permalink
init repo
Browse files Browse the repository at this point in the history
  • Loading branch information
TideDra committed Nov 23, 2024
0 parents commit 5c8ba12
Show file tree
Hide file tree
Showing 9 changed files with 1,237 additions and 0 deletions.
30 changes: 30 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Workflow: runs main.py on a daily schedule (or on manual dispatch) and
# passes all credentials/settings to it through environment variables that
# are filled from repository secrets.
name: Send emails daily
on:
# Allows the workflow to be started by hand from the Actions tab.
workflow_dispatch:
schedule:
# Once a day at 06:00 (GitHub evaluates schedule cron expressions in UTC).
- cron: '0 6 * * *'

jobs:
calculate-and-send:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3

# uv is pinned to an exact version.
- name: Setup uv
uses: astral-sh/setup-uv@v3
with:
version: '0.5.4'

# main.py reads each of these variables as the default of its
# corresponding command-line option.
- name: Run script
env:
ZOTERO_ID: ${{ secrets.ZOTERO_ID }}
ZOTERO_KEY: ${{ secrets.ZOTERO_KEY }}
ARXIV_QUERY: ${{ secrets.ARXIV_QUERY }}
SMTP_SERVER: ${{ secrets.SMTP_SERVER }}
SMTP_PORT: ${{ secrets.SMTP_PORT }}
SENDER: ${{ secrets.SENDER }}
RECEIVER: ${{ secrets.RECEIVER }}
SENDER_PASSWORD: ${{ secrets.SENDER_PASSWORD }}
run: |
uv run main.py
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv

test.ipynb
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.11
Empty file added README.md
Empty file.
116 changes: 116 additions & 0 deletions construct_email.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import arxiv
import math
# HTML skeleton of the outgoing e-mail. render_email() replaces the
# `__CONTENT__` placeholder with the concatenated per-paper blocks. The CSS
# classes declared here (star-wrapper, half-star, full-star) are the ones
# used by the markup that get_stars() produces.
framework = """
<!DOCTYPE HTML>
<html>
<head>
<style>
.star-wrapper {
font-size: 2em; /* 调整星星大小 */
line-height: 1; /* 确保垂直对齐 */
display: inline-flex;
align-items: center; /* 保持对齐 */
}
.half-star {
display: inline-block;
width: 0.5em; /* 半颗星的宽度 */
overflow: hidden;
white-space: nowrap;
vertical-align: middle;
}
.full-star {
vertical-align: middle;
}
</style>
</head>
<body>
<div>
__CONTENT__
</div>
<br><br>
<div>
To unsubscribe, remove your email in your Github Action setting.
</div>
</body>
</html>
"""



def get_block_html(title:str, authors:str, rate:str,arxiv_id:str, abstract:str, pdf_url:str, code_url:str=None):
code = f'<a href="{code_url}" style="display: inline-block; text-decoration: none; font-size: 14px; font-weight: bold; color: #fff; background-color: #5bc0de; padding: 8px 16px; border-radius: 4px; margin-left: 8px;">Code</a>' if code_url else ''
block_template = """
<table border="0" cellpadding="0" cellspacing="0" width="100%" style="font-family: Arial, sans-serif; border: 1px solid #ddd; border-radius: 8px; padding: 16px; background-color: #f9f9f9;">
<tr>
<td style="font-size: 20px; font-weight: bold; color: #333;">
{title}
</td>
</tr>
<tr>
<td style="font-size: 14px; color: #666; padding: 8px 0;">
{authors}
</td>
</tr>
<tr>
<td style="font-size: 14px; color: #333; padding: 8px 0;">
<strong>Relevance:</strong> {rate}
</td>
</tr>
<tr>
<td style="font-size: 14px; color: #333; padding: 8px 0;">
<strong>arXiv ID:</strong> {arxiv_id}
</td>
</tr>
<tr>
<td style="font-size: 14px; color: #333; padding: 8px 0;">
<strong>Abstract:</strong> {abstract}
</td>
</tr>
<tr>
<td style="padding: 8px 0;">
<a href="{pdf_url}" style="display: inline-block; text-decoration: none; font-size: 14px; font-weight: bold; color: #fff; background-color: #d9534f; padding: 8px 16px; border-radius: 4px;">PDF</a>
{code}
</td>
</tr>
</table>
"""
return block_template.format(title=title, authors=authors,rate=rate,arxiv_id=arxiv_id, abstract=abstract, pdf_url=pdf_url, code=code)

def get_stars(score: float) -> str:
    """Convert a relevance score into star-rating markup.

    Scores at or below 6 produce no stars (empty string); scores at or above
    8 produce five full stars. In between, the range is divided into ten
    half-star steps and the score is rounded up to the next step.

    Args:
        score: Relevance score of a paper.

    Returns:
        An HTML fragment wrapped in a ``star-wrapper`` div, or ``''`` when
        the score does not earn any star.
    """
    full_star = '<span class="full-star">⭐</span>'
    half_star = '<span class="half-star">⭐</span>'
    low = 6
    high = 8
    if score <= low:
        return ''
    if score >= high:
        # Wrap the maximum rating too, so it gets the same star-wrapper
        # styling as every partial rating.
        full_star_num, half_star_num = 5, 0
    else:
        interval = (high - low) / 10  # width of one half-star step
        star_num = math.ceil((score - low) / interval)  # number of half stars
        full_star_num, half_star_num = divmod(star_num, 2)
    return '<div class="star-wrapper">' + full_star * full_star_num + half_star * half_star_num + '</div>'


def render_email(papers: list[arxiv.Result]):
    """Build the complete HTML e-mail for a list of papers.

    Each paper is expected to carry the extra ``score``, ``arxiv_id`` and
    ``code_url`` attributes in addition to the ``title``, ``summary``,
    ``authors`` and ``pdf_url`` attributes read here.

    Args:
        papers: Papers to include, in display order.

    Returns:
        The ``framework`` template with ``__CONTENT__`` replaced by one
        block per paper.
    """
    max_abstract_chars = 600
    max_authors = 5
    parts = []
    for p in papers:
        # Crop long abstracts. The ellipsis is added only when text was
        # actually cut off (an abstract of exactly 600 characters is complete).
        summary = p.summary
        if len(summary) > max_abstract_chars:
            summary = summary[:max_abstract_chars] + '...'
        rate = get_stars(p.score)
        # Show at most the first five authors and mark the omission.
        authors = ', '.join(a.name for a in p.authors[:max_authors])
        if len(p.authors) > max_authors:
            authors += ', ...'
        parts.append(get_block_html(p.title, authors, rate, p.arxiv_id, summary, p.pdf_url, p.code_url))

    # <br> is a void element; "</br>" is invalid markup that HTML parsers
    # treat as another <br>, so plain <br> tags are emitted instead.
    content = '<br>' + '<br><br>'.join(parts) + '<br>'
    return framework.replace('__CONTENT__', content)
111 changes: 111 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import arxiv
import argparse
import os
from pyzotero import zotero
from recommender import rerank_paper
from construct_email import render_email
import requests
import datetime
import re
from time import sleep
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib

def get_zotero_corpus(id:str,key:str) -> list[dict]:
    """Fetch the user's Zotero papers that have a non-empty abstract.

    Args:
        id: Zotero user ID.
        key: Zotero API key.

    Returns:
        The conference papers, journal articles and preprints of the user
        library whose ``data.abstractNote`` is not the empty string.
    """
    library = zotero.Zotero(id, 'user', key)
    wanted_types = 'conferencePaper || journalArticle || preprint'
    # everything() is given the first items() response and returns the
    # collected result that is filtered below.
    all_items = library.everything(library.items(itemType=wanted_types))
    with_abstract = []
    for item in all_items:
        if item['data']['abstractNote'] != '':
            with_abstract.append(item)
    return with_abstract

def get_paper_code_url(paper: arxiv.Result) -> str | None:
    """Look up a code repository for a paper on Papers with Code.

    Args:
        paper: arXiv result carrying an ``arxiv_id`` attribute.

    Returns:
        The URL of the first repository listed for the paper, or ``None``
        when the paper or its repositories cannot be found or the API
        cannot be reached.
    """
    paper_list = _get_json_with_retry(f'https://paperswithcode.com/api/v1/papers/?arxiv_id={paper.arxiv_id}')
    if not isinstance(paper_list, dict) or paper_list.get('count', 0) == 0:
        return None
    paper_id = paper_list['results'][0]['id']

    repo_list = _get_json_with_retry(f'https://paperswithcode.com/api/v1/papers/{paper_id}/repositories/')
    if not isinstance(repo_list, dict) or repo_list.get('count', 0) == 0:
        return None
    return repo_list['results'][0]['url']


def _get_json_with_retry(url: str, retries: int = 5, timeout: float = 10):
    """GET ``url`` and decode its JSON body; return ``None`` if every attempt fails."""
    for _ in range(retries):
        try:
            # The timeout prevents a stalled connection from hanging the run.
            return requests.get(url, timeout=timeout).json()
        except (requests.RequestException, ValueError):
            # Network failure or undecodable body: wait briefly and retry.
            # Narrow exception types so Ctrl-C / SystemExit still propagate.
            sleep(1)
    return None

def get_arxiv_paper(query:str, start:datetime.datetime, end:datetime.datetime) -> list[arxiv.Result]:
    """Collect arXiv results for ``query`` published in ``[start, end)``.

    Every kept result is annotated with two extra attributes: ``arxiv_id``
    (short id with the trailing version suffix removed) and ``code_url``
    (value returned by ``get_paper_code_url``).

    Args:
        query: arXiv search query.
        start: Inclusive lower bound on the publication time.
        end: Exclusive upper bound on the publication time.

    Returns:
        The matching results in the order the client yielded them.
    """
    search = arxiv.Search(query=query, sort_by=arxiv.SortCriterion.SubmittedDate)
    selected = []
    for result in arxiv.Client().results(search):
        published = result.published
        if published < end and published >= start:
            result.arxiv_id = re.sub(r'v\d+$', '', result.get_short_id())
            result.code_url = get_paper_code_url(result)
            selected.append(result)
            continue
        if published < start:
            # Stop at the first result older than the window; this relies on
            # the results arriving newest-first.
            break
    return selected

def send_email(sender:str, receiver:str, password:str,smtp_server:str,smtp_port:int, html:str,):
    """Send ``html`` as an HTML e-mail through an SMTP server using STARTTLS.

    Args:
        sender: Sender address; also used as the SMTP login name.
        receiver: Recipient address.
        password: Password for the SMTP login.
        smtp_server: SMTP host name.
        smtp_port: SMTP port.
        html: HTML body of the message.

    Raises:
        smtplib.SMTPException: Propagated from the SMTP session on failure.
        OSError: Propagated when the connection cannot be established.
    """
    def _format_addr(s):
        # Encode the display name so non-ASCII names survive in the header.
        name, addr = parseaddr(s)
        return formataddr((Header(name, 'utf-8').encode(), addr))

    msg = MIMEText(html, 'html', 'utf-8')
    msg['From'] = _format_addr('Github Action <%s>' % sender)
    msg['To'] = _format_addr('You <%s>' % receiver)
    today = datetime.datetime.now().strftime('%Y/%m/%d')
    msg['Subject'] = Header(f'Daily arXiv {today}', 'utf-8').encode()

    # The context manager issues QUIT and closes the socket even when
    # starttls/login/sendmail raises, so the connection is never leaked.
    with smtplib.SMTP(smtp_server, smtp_port) as server:
        server.starttls()
        server.login(sender, password)
        server.sendmail(sender, [receiver], msg.as_string())

if __name__ == '__main__':
    # Entry point: fetch the Zotero corpus and recent arXiv papers, rank the
    # papers against the corpus, e-mail the result and keep a local copy.
    # Every option defaults to the environment variable of the same purpose.
    parser = argparse.ArgumentParser(description='Recommender system for academic papers')
    parser.add_argument('--zotero_id', type=str, help='Zotero user ID',default=os.environ.get('ZOTERO_ID'))
    parser.add_argument('--zotero_key', type=str, help='Zotero API key',default=os.environ.get('ZOTERO_KEY'))
    parser.add_argument('--arxiv_query', type=str, help='Arxiv search query',default=os.environ.get('ARXIV_QUERY'))
    parser.add_argument('--smtp_server', type=str, help='SMTP server',default=os.environ.get('SMTP_SERVER'))
    parser.add_argument('--smtp_port', type=int, help='SMTP port',default=os.environ.get('SMTP_PORT'))
    parser.add_argument('--sender', type=str, help='Sender email address',default=os.environ.get('SENDER'))
    parser.add_argument('--receiver', type=str, help='Receiver email address',default=os.environ.get('RECEIVER'))
    parser.add_argument('--password', type=str, help='Sender email password',default=os.environ.get('SENDER_PASSWORD'))
    args = parser.parse_args()

    # Validate explicitly instead of with `assert` (which is stripped under
    # -O), and check the SMTP settings too so a misconfiguration is reported
    # before any network or model work is done.
    required = ('zotero_id', 'zotero_key', 'arxiv_query', 'smtp_server',
                'smtp_port', 'sender', 'receiver', 'password')
    missing = [name for name in required if getattr(args, name) in (None, '')]
    if missing:
        parser.error('missing required settings: ' + ', '.join(missing))

    # Search window: the two days preceding today's UTC midnight.
    today = datetime.datetime.now(tz=datetime.timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
    window_start = today - datetime.timedelta(days=2)
    print("Retrieving Zotero corpus...")
    corpus = get_zotero_corpus(args.zotero_id, args.zotero_key)
    print("Retrieving Arxiv papers...")
    papers = get_arxiv_paper(args.arxiv_query, window_start, today)
    if not papers:
        print("No new papers found.")
        raise SystemExit(0)
    print("Reranking papers...")
    papers = rerank_paper(papers, corpus)
    html = render_email(papers)
    print("Sending email...")
    send_email(args.sender, args.receiver, args.password, args.smtp_server, args.smtp_port, html)
    # The HTML contains non-ASCII characters (star emoji), so the encoding is
    # fixed rather than left to the platform default.
    with open('email.html', 'w', encoding='utf-8') as f:
        f.write(html)
13 changes: 13 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[project]
name = "zotero-arxiv-daily"
version = "0.1.0"
description = "Daily e-mail of new arXiv papers ranked by similarity to your Zotero library"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "arxiv>=2.1.3",
    "pyzotero>=1.5.25",
    # main.py imports requests directly, so it is declared here instead of
    # relying on it being installed transitively by another dependency.
    "requests>=2.31.0",
    "scikit-learn>=1.5.2",
    "sentence-transformers>=3.3.1",
    "typing-extensions>=4.12.2",
]
19 changes: 19 additions & 0 deletions recommender.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import numpy as np
from sentence_transformers import SentenceTransformer
import arxiv
from datetime import datetime

def rerank_paper(candidate:list[arxiv.Result],corpus:list[dict],model:str='avsolatorio/GIST-small-Embedding-v0') -> list[arxiv.Result]:
    """Score candidate papers against the Zotero corpus and sort them.

    Each candidate's abstract is embedded and compared with the abstract of
    every corpus entry; the similarities are combined with weights that
    decrease with the entry's rank by ``dateAdded`` (newest first), and the
    result is multiplied by 10. The score is stored on each candidate as a
    ``score`` attribute.

    Args:
        candidate: Candidate papers; their ``summary`` is embedded.
        corpus: Zotero items; ``data.abstractNote`` is embedded and
            ``data.dateAdded`` (``%Y-%m-%dT%H:%M:%SZ``) gives the ordering.
        model: Name of the SentenceTransformer model to load.

    Returns:
        The candidates sorted by descending score.
    """
    encoder = SentenceTransformer(model)
    # Sort corpus by date added, from newest to oldest, so that position 0
    # receives the largest weight below.
    corpus = sorted(corpus,key=lambda x: datetime.strptime(x['data']['dateAdded'], '%Y-%m-%dT%H:%M:%SZ'),reverse=True)
    # Weight of the entry at position i is 1 / (1 + log10(i + 1)), then the
    # weights are normalised to sum to 1.
    time_decay_weight = 1 / (1 + np.log10(np.arange(len(corpus)) + 1))
    time_decay_weight = time_decay_weight / time_decay_weight.sum()
    corpus_feature = encoder.encode([paper['data']['abstractNote'] for paper in corpus])
    candidate_feature = encoder.encode([paper.summary for paper in candidate])
    sim = encoder.similarity(candidate_feature,corpus_feature) # [n_candidate, n_corpus]
    scores = (sim * time_decay_weight).sum(axis=1) * 10 # [n_candidate]
    for s,c in zip(scores,candidate):
        c.score = s.item()
    candidate = sorted(candidate,key=lambda x: x.score,reverse=True)
    return candidate
Loading

0 comments on commit 5c8ba12

Please sign in to comment.