Changed HTML parser from BeautifulSoup to html2text
This commit is contained in:
pare
3ebc539285
commit
b1b1718b67
S'han modificat 2 arxius amb 12 addicions i 33 eliminacions
43
mastotuit.py
43
mastotuit.py
|
@ -1,6 +1,6 @@
|
|||
import os
|
||||
import feedparser
|
||||
from bs4 import BeautifulSoup
|
||||
import html2text
|
||||
from mastodon import Mastodon
|
||||
import psycopg2
|
||||
import sys
|
||||
|
@ -16,37 +16,16 @@ import pdb
|
|||
|
||||
logger = logging.getLogger()
|
||||
|
||||
def get_toot(title):
|
||||
def get_toot_text(title):
|
||||
|
||||
soup = BeautifulSoup(title, 'html.parser')
|
||||
|
||||
toot_text = soup.get_text()
|
||||
|
||||
sub_str = 'http'
|
||||
find_link = toot_text.find(sub_str)
|
||||
if find_link != -1:
|
||||
|
||||
tuit_text = toot_text[:toot_text.index(sub_str)]
|
||||
|
||||
else:
|
||||
|
||||
tuit_text = toot_text
|
||||
|
||||
links_lst = ''
|
||||
for links in soup.find_all('a'):
|
||||
find_tag = links.get('href').find('/tags/')
|
||||
if find_tag == -1:
|
||||
links_lst += links.get('href')
|
||||
|
||||
if len(links_lst) > 0:
|
||||
|
||||
last_text = toot_text[len(tuit_text) + len(links_lst):]
|
||||
|
||||
else:
|
||||
|
||||
last_text = ''
|
||||
|
||||
tuit_text = f'{tuit_text} {links_lst} {last_text}'
|
||||
html2text.hn = lambda _:0
|
||||
h = html2text.HTML2Text()
|
||||
h.images_to_alt = True
|
||||
h.single_line_break = True
|
||||
h.ignore_emphasis = True
|
||||
h.ignore_links = True
|
||||
h.ignore_tables = True
|
||||
tuit_text = h.handle(title)
|
||||
|
||||
return tuit_text
|
||||
|
||||
|
@ -337,7 +316,7 @@ if __name__ == '__main__':
|
|||
|
||||
if publish:
|
||||
|
||||
tuit_text = get_toot(title)
|
||||
tuit_text = get_toot_text(title)
|
||||
|
||||
print("Tooting...")
|
||||
print(tuit_text)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
wheel>=0.37.0
|
||||
psycopg2>=2.9.1
|
||||
feedparser>=6.0.8
|
||||
bs4>=0.0.1
|
||||
html2text>=2020.1.16
|
||||
Mastodon.py>=1.5.1
|
||||
tweepy==4.1.0
|
||||
filetype>=1.0.8
|
||||
|
|
Loading…
Referencia en una nova incidència