Changed HTML parser from BeautifulSoup to html2text

2022-01-11 15:11:36 +01:00 · 2022-01-11 15:11:36 +01:00 · b1b1718b67
commit b1b1718b67
--- a/mastotuit.py
+++ b/mastotuit.py
@@ -1,6 +1,6 @@
 import os
 import feedparser
-from bs4 import BeautifulSoup
+import html2text
 from mastodon import Mastodon
 import psycopg2
 import sys
@@ -16,37 +16,16 @@ import pdb
 logger = logging.getLogger()
-def get_toot(title):
+def get_toot_text(title):
-    soup = BeautifulSoup(title, 'html.parser')
+    html2text.hn = lambda _:0
-
+    h = html2text.HTML2Text()
-    toot_text = soup.get_text()
+    h.images_to_alt = True
-
+    h.single_line_break = True
-    sub_str = 'http'
+    h.ignore_emphasis = True
-    find_link = toot_text.find(sub_str)
+    h.ignore_links = True
-    if find_link != -1:
+    h.ignore_tables = True
-
+    tuit_text = h.handle(title)
        tuit_text = toot_text[:toot_text.index(sub_str)]
    else:
        tuit_text = toot_text
    links_lst = ''
    for links in soup.find_all('a'):
        find_tag = links.get('href').find('/tags/')
        if find_tag == -1:
            links_lst += links.get('href')
    if len(links_lst) > 0:
        last_text = toot_text[len(tuit_text) + len(links_lst):]
    else:
        last_text = ''
    tuit_text = f'{tuit_text} {links_lst} {last_text}'
    return tuit_text
@@ -337,7 +316,7 @@ if __name__ == '__main__':
        if publish:
-            tuit_text = get_toot(title)
+            tuit_text = get_toot_text(title)
            print("Tooting...")
            print(tuit_text)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 wheel>=0.37.0
 psycopg2>=2.9.1
 feedparser>=6.0.8
-bs4>=0.0.1
+html2text>=2020.1.16
 Mastodon.py>=1.5.1
 tweepy==4.1.0
 filetype>=1.0.8