Skip to content

Commit

Permalink
Initial Bluesky support. (#22)
Browse files Browse the repository at this point in the history
* Initial Bluesky support.

* Added atproto requirement

* Use the article uri as unique id instead of short_url.

* Fixed rendering diffs.

* Launch the browser in truly headless mode.
  • Loading branch information
qwrrty authored Mar 25, 2024
1 parent 0b59644 commit 6572ca2
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 44 deletions.
5 changes: 5 additions & 0 deletions css/styles.css
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ p {
margin-right: 2em;
margin-top: 1em;
margin-bottom: 1em;
padding-left: 25px;
padding-right: 25px;
padding-top: 10px;
padding-bottom: 10px;
width: 350px;
font-weight: normal;
}

Expand Down
187 changes: 143 additions & 44 deletions nytdiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from selenium import webdriver
from selenium.webdriver.common.by import By

from atproto import Client, models

TIMEZONE = 'America/Buenos_Aires'
LOCAL_TZ = timezone(TIMEZONE)
MAX_RETRIES = 10
Expand All @@ -44,14 +46,15 @@


class BaseParser(object):
def __init__(self, api):
def __init__(self, api, bsky_api=None):
self.urls = list()
self.payload = None
self.articles = dict()
self.current_ids = set()
self.filename = str()
self.db = dataset.connect('sqlite:///titles.db')
self.api = api
self.bsky_api = bsky_api

def test_twitter(self):
print(self.api.rate_limit_status())
Expand Down Expand Up @@ -82,6 +85,26 @@ def get_prev_tweet(self, article_id, column):
else:
return None

def get_bsky_parent(self, article_id, column):
    """Look up the Bluesky thread position recorded for an article.

    The row is fetched from ``self.articles_table`` by ``id`` when
    ``column`` is ``'id'`` and by ``article_id`` for any other value.

    Returns a ``(parent, root)`` tuple of Bluesky strong refs built from
    the stored uri/cid pairs, or ``(None, None)`` when there is no row or
    the row has no ``post_uri`` yet.
    """
    lookup_key = 'id' if column == 'id' else 'article_id'
    row = self.articles_table.find_one(**{lookup_key: article_id})

    # Nothing posted for this article so far: the caller starts a thread.
    if not row or not row.get('post_uri'):
        return (None, None)

    post_uri, post_cid, root_uri, root_cid = (
        row[key] for key in ('post_uri', 'post_cid', 'root_uri', 'root_cid')
    )
    strong_ref = models.ComAtprotoRepoStrongRef.Main
    return (
        strong_ref(uri=post_uri, cid=post_cid),
        strong_ref(uri=root_uri, cid=root_cid),
    )

def update_tweet_db(self, article_id, tweet_id, column):
if column == 'id':
article = {
Expand All @@ -96,6 +119,17 @@ def update_tweet_db(self, article_id, tweet_id, column):
self.articles_table.update(article, [column])
logging.debug('Updated tweet ID in db')

def update_bsky_db(self, article_id, post_ref, root_ref, column):
    """Store the latest Bluesky post ref and thread root ref for an article.

    ``column`` names the key column used to match the row in
    ``self.articles_table``; ``article_id`` is the value matched against it.
    ``post_ref`` and ``root_ref`` must expose ``uri`` and ``cid`` attributes.
    """
    fields = {column: article_id}
    fields.update(
        post_uri=post_ref.uri,
        post_cid=post_ref.cid,
        root_uri=root_ref.uri,
        root_cid=root_ref.cid,
    )
    self.articles_table.update(fields, [column])
    logging.debug('Updated bsky refs in db')

def media_upload(self, filename):
if TESTING:
return 1
Expand Down Expand Up @@ -156,6 +190,62 @@ def tweet(self, text, article_id, url, column='id'):
self.update_tweet_db(article_id, tweet.id, column)
return

def bsky_website_card(self, article_data):
    """Build a Bluesky website preview card for an article.

    Reads ``title``, ``abstract`` and ``url`` from ``article_data``. When
    ``article_data`` carries a non-empty ``thumbnail`` URL, the image is
    downloaded and uploaded through ``self.bsky_api.upload_blob`` so the
    card can display it; otherwise the card is built without a thumbnail.

    Returns a ``models.AppBskyEmbedExternal.Main`` object suitable for
    passing as the ``embed`` argument when sending a post.
    """
    post_title = article_data['title']
    post_description = article_data['abstract']
    post_uri = article_data['url']
    extra_args = {}

    # Test the value, not just the key: json_to_dict always stores a
    # 'thumbnail' entry, and get_thumbnail returns None when no image
    # qualifies, which would make requests.get(url=None) raise.
    thumbnail_url = article_data.get('thumbnail')
    if thumbnail_url:
        try:
            r = requests.get(url=thumbnail_url, timeout=30)
        except requests.RequestException:
            # The thumbnail is optional (a non-OK response is already
            # ignored below), so a failed download must not block the post.
            logging.warning('Could not fetch thumbnail: %s', thumbnail_url)
        else:
            if r.ok:
                thumb = self.bsky_api.upload_blob(r.content)
                extra_args['thumb'] = thumb.blob

    return models.AppBskyEmbedExternal.Main(
        external=models.AppBskyEmbedExternal.External(
            title=post_title,
            description=post_description,
            uri=post_uri,
            **extra_args
        )
    )

def bsky_post(self, text, article_data, column='id'):
    """Post the rendered diff image to Bluesky in the article's thread.

    ``text`` is the post text, ``article_data`` must contain ``article_id``
    and ``url`` (plus whatever ``bsky_website_card`` reads), and ``column``
    selects how the article row is looked up and updated.

    The image is read from ``./output/<self.filename>.png``. If the article
    has no stored parent post, a new thread is started with a website card
    post and the image is posted as a reply to it; otherwise the image is
    posted as a reply to the stored parent. The new post ref and the
    thread root ref are then saved through ``update_bsky_db``.

    Does nothing (apart from logging a warning) when no Bluesky client was
    configured.
    """
    # bsky_api defaults to None when no Bluesky credentials are supplied;
    # skip cleanly instead of failing with AttributeError on None below.
    if self.bsky_api is None:
        logging.warning('Bluesky client not configured, skipping post: %s',
                        text)
        return

    article_id = article_data['article_id']
    url = article_data['url']
    img_path = './output/' + self.filename + '.png'
    with open(img_path, 'rb') as f:
        img_data = f.read()
    logging.info('Media ready with ids: %s', img_path)
    logging.info('Text to post: %s', text)
    logging.info('Article id: %s', article_id)
    (parent_ref, root_ref) = self.get_bsky_parent(article_id, column)
    logging.info('Parent ref: %s', parent_ref)
    logging.info('Root ref: %s', root_ref)
    if parent_ref is None:
        # No parent, let's start a new thread
        logging.info('Posting url: %s', url)
        post = self.bsky_api.send_post(
            '', embed=self.bsky_website_card(article_data)
        )
        root_ref = models.create_strong_ref(post)
        # The first reply in a thread has the root as its parent.
        parent_ref = root_ref

    logging.info('Replying to: %s', parent_ref)
    post = self.bsky_api.send_image(
        text=text,
        image=img_data,
        image_alt='',
        reply_to=models.AppBskyFeedPost.ReplyRef(
            parent=parent_ref, root=root_ref)
    )
    child_ref = models.create_strong_ref(post)
    logging.info('Id to store: %s', child_ref)
    self.update_bsky_db(article_id, child_ref, root_ref, column)

def get_page(self, url, header=None, payload=None):
for x in range(MAX_RETRIES):
try:
Expand Down Expand Up @@ -198,7 +288,7 @@ def show_diff(self, old, new):
<html lang="en">
<head>
<meta charset="utf-8">
<link rel="stylesheet" href="styles.css">
<link rel="stylesheet" href="css/styles.css">
</head>
<body>
<p>
Expand All @@ -207,66 +297,55 @@ def show_diff(self, old, new):
</body>
</html>
""".format(html_diff(old, new))
with TemporaryDirectory() as tmpdir:
with TemporaryDirectory(delete=False) as tmpdir:
tmpfile = os.path.join(tmpdir, 'tmp.html')
with open(tmpfile, 'w') as f:
f.write(html)
shutil.copy('./css/styles.css', tmpdir)
driver = webdriver.Chrome()
for d in ['css', 'fonts', 'img']:
shutil.copytree(d, os.path.join(tmpdir, d))
opts = webdriver.chrome.options.Options()
opts.add_argument("--headless")
opts.add_argument("--window-size=400,400")
driver = webdriver.Chrome(options=opts)
driver.get('file://{}'.format(tmpfile))
logging.info('tmpfile is %s', tmpfile)

e = driver.find_element(By.XPATH, '//p')
start_height = e.location['y']
block_height = e.size['height']
end_height = start_height
start_width = e.location['x']
block_width = e.size['width']
end_width = start_width
total_height = start_height + block_height + end_height
total_width = start_width + block_width + end_width
timestamp = str(int(time.time()))
driver.save_screenshot('./tmp.png')
img = Image.open('./tmp.png')
img2 = img.crop((0, 0, total_width, total_height))
if int(total_width) > int(total_height * 2):
background = Image.new('RGBA', (total_width, int(total_width / 2)),
(255, 255, 255, 0))
bg_w, bg_h = background.size
offset = (int((bg_w - total_width) / 2),
int((bg_h - total_height) / 2))
else:
background = Image.new('RGBA', (total_width, total_height),
(255, 255, 255, 0))
bg_w, bg_h = background.size
offset = (int((bg_w - total_width) / 2),
int((bg_h - total_height) / 2))
background.paste(img2, offset)
self.filename = timestamp + new_hash
background.save('./output/' + self.filename + '.png')
e.screenshot('./output/' + self.filename + '.png')
return True

def __str__(self):
# String form of the parser: the entries of self.urls, one per line.
return ('\n'.join(self.urls))


class NYTParser(BaseParser):
def __init__(self, api, nyt_api_key):
BaseParser.__init__(self, api)
def __init__(self, api, nyt_api_key, bsky_api=None):
BaseParser.__init__(self, api, bsky_api=bsky_api)
self.urls = ['https://api.nytimes.com/svc/topstories/v2/home.json']
self.payload = {'api-key': nyt_api_key}
self.articles_table = self.db['nyt_ids']
self.versions_table = self.db['nyt_versions']

def get_thumbnail(self, article):
    """Return the URL of the first small image attached to an article.

    Scans ``article['multimedia']`` in order and returns the ``url`` of the
    first entry whose ``type`` is ``'image'`` and whose ``width`` is below
    400. Returns ``None`` when no entry qualifies.

    A missing or ``None`` ``multimedia`` field is treated as an empty list,
    and entries lacking ``type``, ``width`` or ``url`` are skipped rather
    than raising.
    """
    for m in article.get('multimedia') or []:
        if m.get('type') != 'image':
            continue
        width = m.get('width')
        url = m.get('url')
        if width is not None and width < 400 and url:
            return url
    return None

def json_to_dict(self, article):
article_dict = dict()
if 'short_url' not in article:
if not article.get('uri'):
return None
article_dict['article_id'] = article['short_url'].split('/')[-1]
article_dict['article_id'] = article['uri']
article_dict['url'] = article['url']
article_dict['title'] = article['title']
article_dict['abstract'] = self.strip_html(article['abstract'])
article_dict['byline'] = article['byline']
article_dict['kicker'] = article['kicker']
article_dict['thumbnail'] = self.get_thumbnail(article)
od = collections.OrderedDict(sorted(article_dict.items()))
article_dict['hash'] = hashlib.sha224(
repr(od.items()).encode('utf-8')).hexdigest()
Expand All @@ -280,7 +359,12 @@ def store_data(self, data):
'article_id': data['article_id'],
'add_dt': data['date_time'],
'status': 'home',
'tweet_id': None
'thumbnail': data['thumbnail'],
'tweet_id': None,
'post_uri': None,
'post_cid': None,
'root_uri': None,
'root_cid': None,
}
self.articles_table.insert(article)
logging.info('New article tracked: %s', data['url'])
Expand Down Expand Up @@ -314,23 +398,27 @@ def store_data(self, data):
if row['url'] != data['url']:
if self.show_diff(row['url'], data['url']):
tweet_text = 'Change in URL'
self.tweet(tweet_text, data['article_id'], url,
'article_id')
self.bsky_post(tweet_text, data, 'article_id')
#self.tweet(tweet_text, data['article_id'], url,
# 'article_id')
if row['title'] != data['title']:
if self.show_diff(row['title'], data['title']):
tweet_text = 'Change in Title'
self.tweet(tweet_text, data['article_id'], url,
'article_id')
self.bsky_post(tweet_text, data, 'article_id')
#self.tweet(tweet_text, data['article_id'], url,
# 'article_id')
if row['abstract'] != data['abstract']:
if self.show_diff(row['abstract'], data['abstract']):
tweet_text = 'Change in Abstract'
self.tweet(tweet_text, data['article_id'], url,
'article_id')
self.bsky_post(tweet_text, data, 'article_id')
#self.tweet(tweet_text, data['article_id'], url,
# 'article_id')
if row['kicker'] != data['kicker']:
if self.show_diff(row['kicker'], data['kicker']):
tweet_text = 'Change in Kicker'
self.tweet(tweet_text, data['article_id'], url,
'article_id')
self.bsky_post(tweet_text, data, 'article_id')
#self.tweet(tweet_text, data['article_id'], url,
# 'article_id')

def loop_data(self, data):
if 'results' not in data:
Expand Down Expand Up @@ -389,10 +477,21 @@ def main():
nyt_api = tweepy.API(auth)
logging.debug('NYT Twitter API configured')

bsky_api = None
if 'BLUESKY_LOGIN' in os.environ:
bsky_login = os.environ['BLUESKY_LOGIN']
bsky_passwd = os.environ['BLUESKY_PASSWD']
bsky_api = Client(base_url='https://bsky.social')
try:
bsky_api.login(bsky_login, bsky_passwd)
except:
logging.exception('Bluesky login failed')
return

try:
logging.debug('Starting NYT')
nyt_api_key = os.environ['NYT_API_KEY']
nyt = NYTParser(nyt_api, nyt_api_key)
nyt = NYTParser(nyt_api, nyt_api_key, bsky_api=bsky_api)
nyt.parse_pages()
logging.debug('Finished NYT')
except:
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
alembic>=1.13.1,<2.0.0
atproto>=0.0.44,<1.0.0
bleach>=6.1.0,<7.0.0
dataset>=1.6.2,<2.0.0
html5lib>=1.1,<2.0
Expand Down

0 comments on commit 6572ca2

Please sign in to comment.