From b1b1718b67e282734686b5faad5cf6e5eef182bc Mon Sep 17 00:00:00 2001
From: spla <salvador.pla@mastodont.cat>
Date: Tue, 11 Jan 2022 15:11:36 +0100
Subject: [PATCH] Changed html parser from BeautifulSoup to html2text

---
 mastotuit.py     | 43 +++++++++++--------------------------------
 requirements.txt |  2 +-
 2 files changed, 12 insertions(+), 33 deletions(-)

diff --git a/mastotuit.py b/mastotuit.py
index 9db7f72..c11de3c 100644
--- a/mastotuit.py
+++ b/mastotuit.py
@@ -1,6 +1,6 @@
 import os
 import feedparser
-from bs4 import BeautifulSoup
+import html2text
 from mastodon import Mastodon
 import psycopg2
 import sys
@@ -16,37 +16,16 @@ import pdb
 
 logger = logging.getLogger()
 
-def get_toot(title):
+def get_toot_text(title):
 
-    soup = BeautifulSoup(title, 'html.parser')
-
-    toot_text = soup.get_text()
-
-    sub_str = 'http'
-    find_link = toot_text.find(sub_str)
-    if find_link != -1:
-
-        tuit_text = toot_text[:toot_text.index(sub_str)]
-
-    else:
-
-        tuit_text = toot_text
-
-    links_lst = ''
-    for links in soup.find_all('a'):
-        find_tag = links.get('href').find('/tags/')
-        if find_tag == -1:
-            links_lst += links.get('href')
-
-    if len(links_lst) > 0:
-
-        last_text = toot_text[len(tuit_text) + len(links_lst):]
-
-    else:
-
-        last_text = ''
-
-    tuit_text = f'{tuit_text} {links_lst} {last_text}'
+    html2text.hn = lambda _:0
+    h = html2text.HTML2Text()
+    h.images_to_alt = True
+    h.single_line_break = True
+    h.ignore_emphasis = True
+    h.ignore_links = True
+    h.ignore_tables = True
+    tuit_text = h.handle(title)
 
     return tuit_text
 
@@ -337,7 +316,7 @@ if __name__ == '__main__':
 
         if publish:
 
-            tuit_text = get_toot(title)
+            tuit_text = get_toot_text(title)
 
             print("Tooting...")
             print(tuit_text)
diff --git a/requirements.txt b/requirements.txt
index 511d650..1c963f9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 wheel>=0.37.0
 psycopg2>=2.9.1
 feedparser>=6.0.8
-bs4>=0.0.1
+html2text>=2020.1.16
 Mastodon.py>=1.5.1
 tweepy==4.1.0
 filetype>=1.0.8