Changed HTML parser from BeautifulSoup to html2text

This commit is contained in:
spla 2022-01-11 15:11:36 +01:00
pare 3ebc539285
commit b1b1718b67
S'han modificat 2 arxius amb 12 adicions i 33 eliminacions

Veure arxiu

@ -1,6 +1,6 @@
import os
import feedparser
from bs4 import BeautifulSoup
import html2text
from mastodon import Mastodon
import psycopg2
import sys
@ -16,37 +16,16 @@ import pdb
logger = logging.getLogger()
def get_toot(title):
def get_toot_text(title):
soup = BeautifulSoup(title, 'html.parser')
toot_text = soup.get_text()
sub_str = 'http'
find_link = toot_text.find(sub_str)
if find_link != -1:
tuit_text = toot_text[:toot_text.index(sub_str)]
else:
tuit_text = toot_text
links_lst = ''
for links in soup.find_all('a'):
find_tag = links.get('href').find('/tags/')
if find_tag == -1:
links_lst += links.get('href')
if len(links_lst) > 0:
last_text = toot_text[len(tuit_text) + len(links_lst):]
else:
last_text = ''
tuit_text = f'{tuit_text} {links_lst} {last_text}'
html2text.hn = lambda _:0
h = html2text.HTML2Text()
h.images_to_alt = True
h.single_line_break = True
h.ignore_emphasis = True
h.ignore_links = True
h.ignore_tables = True
tuit_text = h.handle(title)
return tuit_text
@ -337,7 +316,7 @@ if __name__ == '__main__':
if publish:
tuit_text = get_toot(title)
tuit_text = get_toot_text(title)
print("Tooting...")
print(tuit_text)

Veure arxiu

@ -1,7 +1,7 @@
wheel>=0.37.0
psycopg2>=2.9.1
feedparser>=6.0.8
bs4>=0.0.1
html2text>=2020.1.16
Mastodon.py>=1.5.1
tweepy==4.1.0
filetype>=1.0.8