Changed HTML parser from BeautifulSoup to html2text

2022-01-11 15:11:36 +01:00 · 2022-01-11 15:11:36 +01:00 · b1b1718b67
commit b1b1718b67
--- a/mastotuit.py
+++ b/mastotuit.py
@ -1,6 +1,6 @@
 import os
 import feedparser
-from bs4 import BeautifulSoup
+import html2text
 from mastodon import Mastodon
 import psycopg2
 import sys
@ -16,37 +16,16 @@ import pdb

 logger = logging.getLogger()

-def get_toot(title):
+def get_toot_text(title):

-    soup = BeautifulSoup(title, 'html.parser')
-
-    toot_text = soup.get_text()
-
-    sub_str = 'http'
-    find_link = toot_text.find(sub_str)
-    if find_link != -1:
-
-        tuit_text = toot_text[:toot_text.index(sub_str)]
-
-    else:
-
-        tuit_text = toot_text
-
-    links_lst = ''
-    for links in soup.find_all('a'):
-        find_tag = links.get('href').find('/tags/')
-        if find_tag == -1:
-            links_lst += links.get('href')
-
-    if len(links_lst) > 0:
-
-        last_text = toot_text[len(tuit_text) + len(links_lst):]
-
-    else:
-
-        last_text = ''
-
-    tuit_text = f'{tuit_text} {links_lst} {last_text}'
+    html2text.hn = lambda _:0
+    h = html2text.HTML2Text()
+    h.images_to_alt = True
+    h.single_line_break = True
+    h.ignore_emphasis = True
+    h.ignore_links = True
+    h.ignore_tables = True
+    tuit_text = h.handle(title)

    return tuit_text

@ -337,7 +316,7 @@ if __name__ == '__main__':

        if publish:

-            tuit_text = get_toot(title)
+            tuit_text = get_toot_text(title)

            print("Tooting...")
            print(tuit_text)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +1,7 @@
 wheel>=0.37.0
 psycopg2>=2.9.1
 feedparser>=6.0.8
-bs4>=0.0.1
+html2text>=2020.1.16
 Mastodon.py>=1.5.1
 tweepy==4.1.0
 filetype>=1.0.8