From b1b1718b67e282734686b5faad5cf6e5eef182bc Mon Sep 17 00:00:00 2001 From: spla Date: Tue, 11 Jan 2022 15:11:36 +0100 Subject: [PATCH] Changed html parser from BeautifulSoup to html2text --- mastotuit.py | 43 +++++++++++-------------------------------- requirements.txt | 2 +- 2 files changed, 12 insertions(+), 33 deletions(-) diff --git a/mastotuit.py b/mastotuit.py index 9db7f72..c11de3c 100644 --- a/mastotuit.py +++ b/mastotuit.py @@ -1,6 +1,6 @@ import os import feedparser -from bs4 import BeautifulSoup +import html2text from mastodon import Mastodon import psycopg2 import sys @@ -16,37 +16,16 @@ import pdb logger = logging.getLogger() -def get_toot(title): +def get_toot_text(title): - soup = BeautifulSoup(title, 'html.parser') - - toot_text = soup.get_text() - - sub_str = 'http' - find_link = toot_text.find(sub_str) - if find_link != -1: - - tuit_text = toot_text[:toot_text.index(sub_str)] - - else: - - tuit_text = toot_text - - links_lst = '' - for links in soup.find_all('a'): - find_tag = links.get('href').find('/tags/') - if find_tag == -1: - links_lst += links.get('href') - - if len(links_lst) > 0: - - last_text = toot_text[len(tuit_text) + len(links_lst):] - - else: - - last_text = '' - - tuit_text = f'{tuit_text} {links_lst} {last_text}' + html2text.hn = lambda _:0 + h = html2text.HTML2Text() + h.images_to_alt = True + h.single_line_break = True + h.ignore_emphasis = True + h.ignore_links = True + h.ignore_tables = True + tuit_text = h.handle(title) return tuit_text @@ -337,7 +316,7 @@ if __name__ == '__main__': if publish: - tuit_text = get_toot(title) + tuit_text = get_toot_text(title) print("Tooting...") print(tuit_text) diff --git a/requirements.txt b/requirements.txt index 511d650..1c963f9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ wheel>=0.37.0 psycopg2>=2.9.1 feedparser>=6.0.8 -bs4>=0.0.1 +html2text>=2020.1.16 Mastodon.py>=1.5.1 tweepy==4.1.0 filetype>=1.0.8