diff --git a/mastotuit.py b/mastotuit.py
index e2ddffe..7629acc 100644
--- a/mastotuit.py
+++ b/mastotuit.py
@@ -1,6 +1,6 @@
 import os
 import feedparser
-import html2text
+from bs4 import BeautifulSoup
 from mastodon import Mastodon
 import psycopg2
 import sys
@@ -17,14 +17,19 @@ logger = logging.getLogger()
 
 def get_toot_text(title):
 
-    html2text.hn = lambda _:0
-    h = html2text.HTML2Text()
-    h.images_to_alt = True
-    h.single_line_break = True
-    h.ignore_emphasis = True
-    h.ignore_links = True
-    h.ignore_tables = True
-    tuit_text = h.handle(title)
+    """Return the plain text of the HTML in *title*, one line per <br>.
+
+    Every <br> tag becomes a newline, all other markup is dropped and a
+    trailing newline is appended to the result.
+    """
+    soup = BeautifulSoup(title, features='html.parser')
+
+    # Swap each <br> for a real newline directly in the parse tree. No
+    # sentinel string is involved, so text containing '###' stays intact.
+    for line_break in soup.find_all('br'):
+        line_break.replace_with('\n')
+
+    tuit_text = f'{soup.get_text()}\n'
 
     return tuit_text
 
diff --git a/requirements.txt b/requirements.txt
index df97b66..d06604c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 wheel>=0.37.0
 psycopg2>=2.9.1
 feedparser>=6.0.8
-html2text>=2020.1.16
+beautifulsoup4>=4.10.0
 Mastodon.py>=1.5.1
 tweepy>=4.5.0
 filetype>=1.0.8