Initial Bluesky support. (#22)

* Initial Bluesky support.
* Added atproto requirement.
* Use the article uri as unique id instead of short_url.
* Fixed rendering diffs.
* Launch the browser in truly headless mode.
j-e-d · Mar 25, 2024 · 6572ca2 · 6572ca2
1 parent 0b59644
commit 6572ca2
Show file tree

Hide file tree

Showing 3 changed files with 149 additions and 44 deletions.
diff --git a/css/styles.css b/css/styles.css
@@ -16,6 +16,11 @@ p {
     margin-right: 2em;
     margin-top: 1em;
     margin-bottom: 1em;
+    padding-left: 25px;
+    padding-right: 25px;
+    padding-top: 10px;
+    padding-bottom: 10px;
+    width: 350px;
     font-weight: normal;
 }
 

diff --git a/nytdiff.py b/nytdiff.py
@@ -22,6 +22,8 @@
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 
+from atproto import Client, models
+
 TIMEZONE = 'America/Buenos_Aires'
 LOCAL_TZ = timezone(TIMEZONE)
 MAX_RETRIES = 10
@@ -44,14 +46,15 @@
 
 
 class BaseParser(object):
-    def __init__(self, api):
+    def __init__(self, api, bsky_api=None):
         self.urls = list()
         self.payload = None
         self.articles = dict()
         self.current_ids = set()
         self.filename = str()
         self.db = dataset.connect('sqlite:///titles.db')
         self.api = api
+        self.bsky_api = bsky_api
 
     def test_twitter(self):
         print(self.api.rate_limit_status())
@@ -82,6 +85,26 @@ def get_prev_tweet(self, article_id, column):
             else:
                 return None
 
+    def get_bsky_parent(self, article_id, column):
+        """Look up the Bluesky strong refs stored for an article.
+
+        The row is matched on ``id`` when ``column`` is ``'id'`` and on
+        ``article_id`` for any other value of ``column``.
+
+        Returns a ``(parent, root)`` tuple of
+        ``models.ComAtprotoRepoStrongRef.Main`` objects built from the
+        row's ``post_*`` and ``root_*`` columns, or ``(None, None)``
+        when no row is found or the row has no ``post_uri``.
+        """
+        lookup_key = 'id' if column == 'id' else 'article_id'
+        row = self.articles_table.find_one(**{lookup_key: article_id})
+        if not row or not row.get('post_uri'):
+            return (None, None)
+
+        # Read every stored column before building either ref.
+        parent_uri, parent_cid, thread_uri, thread_cid = (
+            row['post_uri'], row['post_cid'],
+            row['root_uri'], row['root_cid'],
+        )
+        strong_ref = models.ComAtprotoRepoStrongRef.Main
+        parent = strong_ref(uri=parent_uri, cid=parent_cid)
+        root = strong_ref(uri=thread_uri, cid=thread_cid)
+        return (parent, root)
+
     def update_tweet_db(self, article_id, tweet_id, column):
         if column == 'id':
             article = {
@@ -96,6 +119,17 @@ def update_tweet_db(self, article_id, tweet_id, column):
         self.articles_table.update(article, [column])
         logging.debug('Updated tweet ID in db')
 
+    def update_bsky_db(self, article_id, post_ref, root_ref, column):
+        """Save the newest post ref and the thread root ref for an article.
+
+        ``post_ref`` and ``root_ref`` must expose ``uri`` and ``cid``
+        attributes. The row of ``self.articles_table`` whose ``column``
+        equals ``article_id`` is updated with the four values.
+        """
+        row = {column: article_id}
+        row.update(
+            post_uri=post_ref.uri,
+            post_cid=post_ref.cid,
+            root_uri=root_ref.uri,
+            root_cid=root_ref.cid,
+        )
+        # ``column`` is the key used to select the row to update.
+        self.articles_table.update(row, [column])
+        logging.debug('Updated bsky refs in db')
+
     def media_upload(self, filename):
         if TESTING:
             return 1
@@ -156,6 +190,62 @@ def tweet(self, text, article_id, url, column='id'):
             self.update_tweet_db(article_id, tweet.id, column)
         return
 
+    def bsky_website_card(self, article_data):
+        """Build a Bluesky external-link ("website card") embed.
+
+        ``article_data`` must provide ``title``, ``abstract`` and ``url``.
+        If it also holds a non-empty ``thumbnail`` URL, the image is
+        downloaded and uploaded with ``self.bsky_api.upload_blob`` so it
+        can be attached to the card. Fetching the thumbnail is best
+        effort: a failed download is logged and the card is built
+        without an image.
+
+        Returns a ``models.AppBskyEmbedExternal.Main`` object suitable
+        for passing as the ``embed`` argument when sending a post.
+        """
+        post_title = article_data['title']
+        post_description = article_data['abstract']
+        post_uri = article_data['url']
+        extra_args = {}
+
+        # Test the value, not the key: the key can be present with a
+        # None value when no suitable image was found, and
+        # requests.get(url=None) would raise.
+        thumbnail_url = article_data.get('thumbnail')
+        if thumbnail_url:
+            try:
+                # Without a timeout a stalled server blocks forever.
+                r = requests.get(url=thumbnail_url, timeout=30)
+            except requests.RequestException:
+                logging.exception(
+                    'Could not fetch thumbnail: %s', thumbnail_url)
+            else:
+                if r.ok:
+                    thumb = self.bsky_api.upload_blob(r.content)
+                    extra_args['thumb'] = thumb.blob
+
+        return models.AppBskyEmbedExternal.Main(
+            external=models.AppBskyEmbedExternal.External(
+                title=post_title,
+                description=post_description,
+                uri=post_uri,
+                **extra_args
+            )
+        )
+
+    def bsky_post(self, text, article_data, column='id'):
+        """Post the latest diff image to the article's Bluesky thread.
+
+        The image is read from ``./output/<self.filename>.png``. If
+        ``get_bsky_parent`` returns no stored refs for the article, a
+        text-less post carrying the website card is sent first and
+        becomes both root and parent of the thread. The image is then
+        sent as a reply with ``text``, and the resulting refs are saved
+        through ``update_bsky_db``.
+        """
+        article_id = article_data['article_id']
+        url = article_data['url']
+        img_path = './output/' + self.filename + '.png'
+        with open(img_path, 'rb') as img_file:
+            img_data = img_file.read()
+        logging.info('Media ready with ids: %s', img_path)
+        logging.info('Text to post: %s', text)
+        logging.info('Article id: %s', article_id)
+
+        parent_ref, root_ref = self.get_bsky_parent(article_id, column)
+        logging.info('Parent ref: %s', parent_ref)
+        logging.info('Root ref: %s', root_ref)
+
+        if parent_ref is None:
+            # Nothing stored for this article yet: open the thread with
+            # a website card; that post is both root and parent.
+            logging.info('Posting url: %s', url)
+            thread_start = self.bsky_api.send_post(
+                '', embed=self.bsky_website_card(article_data)
+            )
+            parent_ref = root_ref = models.create_strong_ref(thread_start)
+
+        logging.info('Replying to: %s', parent_ref)
+        image_post = self.bsky_api.send_image(
+            text=text,
+            image=img_data,
+            image_alt='',
+            reply_to=models.AppBskyFeedPost.ReplyRef(
+                parent=parent_ref, root=root_ref),
+        )
+        stored_ref = models.create_strong_ref(image_post)
+        logging.info('Id to store: %s', stored_ref)
+        self.update_bsky_db(article_id, stored_ref, root_ref, column)
+
     def get_page(self, url, header=None, payload=None):
         for x in range(MAX_RETRIES):
             try:
@@ -198,7 +288,7 @@ def show_diff(self, old, new):
         <html lang="en">
           <head>
             <meta charset="utf-8">
-            <link rel="stylesheet" href="styles.css">
+            <link rel="stylesheet" href="css/styles.css">
           </head>
           <body>
           <p>
@@ -207,66 +297,55 @@ def show_diff(self, old, new):
           </body>
         </html>
         """.format(html_diff(old, new))
-        with TemporaryDirectory() as tmpdir:
+        with TemporaryDirectory(delete=False) as tmpdir:
             tmpfile = os.path.join(tmpdir, 'tmp.html')
             with open(tmpfile, 'w') as f:
                 f.write(html)
-            shutil.copy('./css/styles.css', tmpdir)
-            driver = webdriver.Chrome()
+            for d in ['css', 'fonts', 'img']:
+                shutil.copytree(d, os.path.join(tmpdir, d))
+            opts = webdriver.chrome.options.Options()
+            opts.add_argument("--headless")
+            opts.add_argument("--window-size=400,400")
+            driver = webdriver.Chrome(options=opts)
             driver.get('file://{}'.format(tmpfile))
+            logging.info('tmpfile is %s', tmpfile)
 
         e = driver.find_element(By.XPATH, '//p')
-        start_height = e.location['y']
-        block_height = e.size['height']
-        end_height = start_height
-        start_width = e.location['x']
-        block_width = e.size['width']
-        end_width = start_width
-        total_height = start_height + block_height + end_height
-        total_width = start_width + block_width + end_width
         timestamp = str(int(time.time()))
-        driver.save_screenshot('./tmp.png')
-        img = Image.open('./tmp.png')
-        img2 = img.crop((0, 0, total_width, total_height))
-        if int(total_width) > int(total_height * 2):
-            background = Image.new('RGBA', (total_width, int(total_width / 2)),
-                                   (255, 255, 255, 0))
-            bg_w, bg_h = background.size
-            offset = (int((bg_w - total_width) / 2),
-                      int((bg_h - total_height) / 2))
-        else:
-            background = Image.new('RGBA', (total_width, total_height),
-                                   (255, 255, 255, 0))
-            bg_w, bg_h = background.size
-            offset = (int((bg_w - total_width) / 2),
-                      int((bg_h - total_height) / 2))
-        background.paste(img2, offset)
         self.filename = timestamp + new_hash
-        background.save('./output/' + self.filename + '.png')
+        e.screenshot('./output/' + self.filename + '.png')
         return True
 
     def __str__(self):
         return ('\n'.join(self.urls))
 
 
 class NYTParser(BaseParser):
-    def __init__(self, api, nyt_api_key):
-        BaseParser.__init__(self, api)
+    def __init__(self, api, nyt_api_key, bsky_api=None):
+        BaseParser.__init__(self, api, bsky_api=bsky_api)
         self.urls = ['https://api.nytimes.com/svc/topstories/v2/home.json']
         self.payload = {'api-key': nyt_api_key}
         self.articles_table = self.db['nyt_ids']
         self.versions_table = self.db['nyt_versions']
 
+    def get_thumbnail(self, article):
+        """Return the URL of the first image narrower than 400, or None.
+
+        ``article`` is a dict whose ``multimedia`` entry, when present,
+        is a list of dicts with ``type``, ``width`` and ``url`` keys.
+        A missing or null ``multimedia`` entry, and entries lacking any
+        of those keys, are skipped instead of raising.
+        """
+        # NOTE(review): the API reportedly sends "multimedia": null for
+        # some stories - confirm; `or []` covers that and a missing key.
+        for m in article.get('multimedia') or []:
+            width = m.get('width')
+            url = m.get('url')
+            if (m.get('type') == 'image' and url
+                    and width is not None and width < 400):
+                return url
+        return None
+
     def json_to_dict(self, article):
         article_dict = dict()
-        if 'short_url' not in article:
+        if not article.get('uri'):
             return None
-        article_dict['article_id'] = article['short_url'].split('/')[-1]
+        article_dict['article_id'] = article['uri']
         article_dict['url'] = article['url']
         article_dict['title'] = article['title']
         article_dict['abstract'] = self.strip_html(article['abstract'])
         article_dict['byline'] = article['byline']
         article_dict['kicker'] = article['kicker']
+        article_dict['thumbnail'] = self.get_thumbnail(article)
         od = collections.OrderedDict(sorted(article_dict.items()))
         article_dict['hash'] = hashlib.sha224(
             repr(od.items()).encode('utf-8')).hexdigest()
@@ -280,7 +359,12 @@ def store_data(self, data):
                 'article_id': data['article_id'],
                 'add_dt': data['date_time'],
                 'status': 'home',
-                'tweet_id': None
+                'thumbnail': data['thumbnail'],
+                'tweet_id': None,
+                'post_uri': None,
+                'post_cid': None,
+                'root_uri': None,
+                'root_cid': None,
             }
             self.articles_table.insert(article)
             logging.info('New article tracked: %s', data['url'])
@@ -314,23 +398,27 @@ def store_data(self, data):
                     if row['url'] != data['url']:
                         if self.show_diff(row['url'], data['url']):
                             tweet_text = 'Change in URL'
-                            self.tweet(tweet_text, data['article_id'], url,
-                                       'article_id')
+                            self.bsky_post(tweet_text, data, 'article_id')
+                            #self.tweet(tweet_text, data['article_id'], url,
+                            #           'article_id')
                     if row['title'] != data['title']:
                         if self.show_diff(row['title'], data['title']):
                             tweet_text = 'Change in Title'
-                            self.tweet(tweet_text, data['article_id'], url,
-                                       'article_id')
+                            self.bsky_post(tweet_text, data, 'article_id')
+                            #self.tweet(tweet_text, data['article_id'], url,
+                            #           'article_id')
                     if row['abstract'] != data['abstract']:
                         if self.show_diff(row['abstract'], data['abstract']):
                             tweet_text = 'Change in Abstract'
-                            self.tweet(tweet_text, data['article_id'], url,
-                                       'article_id')
+                            self.bsky_post(tweet_text, data, 'article_id')
+                            #self.tweet(tweet_text, data['article_id'], url,
+                            #           'article_id')
                     if row['kicker'] != data['kicker']:
                         if self.show_diff(row['kicker'], data['kicker']):
                             tweet_text = 'Change in Kicker'
-                            self.tweet(tweet_text, data['article_id'], url,
-                                       'article_id')
+                            self.bsky_post(tweet_text, data, 'article_id')
+                            #self.tweet(tweet_text, data['article_id'], url,
+                            #           'article_id')
 
     def loop_data(self, data):
         if 'results' not in data:
@@ -389,10 +477,21 @@ def main():
     nyt_api = tweepy.API(auth)
     logging.debug('NYT Twitter API configured')
 
+    bsky_api = None
+    if 'BLUESKY_LOGIN' in os.environ:
+        bsky_login = os.environ['BLUESKY_LOGIN']
+        bsky_passwd = os.environ['BLUESKY_PASSWD']
+        bsky_api = Client(base_url='https://bsky.social')
+        try:
+            bsky_api.login(bsky_login, bsky_passwd)
+        except:
+            logging.exception('Bluesky login failed')
+            return
+
     try:
         logging.debug('Starting NYT')
         nyt_api_key = os.environ['NYT_API_KEY']
-        nyt = NYTParser(nyt_api, nyt_api_key)
+        nyt = NYTParser(nyt_api, nyt_api_key, bsky_api=bsky_api)
         nyt.parse_pages()
         logging.debug('Finished NYT')
     except:

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,5 @@
 alembic>=1.13.1,<2.0.0
+atproto>=0.0.44,<1.0.0
 bleach>=6.1.0,<7.0.0
 dataset>=1.6.2,<2.0.0
 html5lib>=1.1,<2.0