Changed HTML parser from BeautifulSoup to html2text

This commit is contained in:
spla 2022-01-11 15:11:36 +01:00
pare 3ebc539285
commit b1b1718b67
S'han modificat 2 arxius amb 12 addicions i 33 eliminacions

Veure arxiu

@ -1,6 +1,6 @@
import os import os
import feedparser import feedparser
from bs4 import BeautifulSoup import html2text
from mastodon import Mastodon from mastodon import Mastodon
import psycopg2 import psycopg2
import sys import sys
@ -16,37 +16,16 @@ import pdb
logger = logging.getLogger() logger = logging.getLogger()
def get_toot(title): def get_toot_text(title):
soup = BeautifulSoup(title, 'html.parser') html2text.hn = lambda _:0
h = html2text.HTML2Text()
toot_text = soup.get_text() h.images_to_alt = True
h.single_line_break = True
sub_str = 'http' h.ignore_emphasis = True
find_link = toot_text.find(sub_str) h.ignore_links = True
if find_link != -1: h.ignore_tables = True
tuit_text = h.handle(title)
tuit_text = toot_text[:toot_text.index(sub_str)]
else:
tuit_text = toot_text
links_lst = ''
for links in soup.find_all('a'):
find_tag = links.get('href').find('/tags/')
if find_tag == -1:
links_lst += links.get('href')
if len(links_lst) > 0:
last_text = toot_text[len(tuit_text) + len(links_lst):]
else:
last_text = ''
tuit_text = f'{tuit_text} {links_lst} {last_text}'
return tuit_text return tuit_text
@ -337,7 +316,7 @@ if __name__ == '__main__':
if publish: if publish:
tuit_text = get_toot(title) tuit_text = get_toot_text(title)
print("Tooting...") print("Tooting...")
print(tuit_text) print(tuit_text)

Veure arxiu

@ -1,7 +1,7 @@
wheel>=0.37.0 wheel>=0.37.0
psycopg2>=2.9.1 psycopg2>=2.9.1
feedparser>=6.0.8 feedparser>=6.0.8
bs4>=0.0.1 html2text>=2020.1.16
Mastodon.py>=1.5.1 Mastodon.py>=1.5.1
tweepy==4.1.0 tweepy==4.1.0
filetype>=1.0.8 filetype>=1.0.8