fediverse/fetchservers.py

362 líneas
9,3 KiB
Python
Original Vista normal Històric

2022-03-02 22:26:05 +01:00
from multiprocessing import set_start_method
from multiprocessing import get_context
from itertools import product
2021-05-14 10:39:26 +02:00
import time
from datetime import datetime
import os
import json
import sys
import os.path
import psycopg2
import aiohttp
import asyncio
import socket
2022-03-02 22:26:05 +01:00
import pdb
2021-05-14 10:39:26 +02:00
# Exception classes grouped into one tuple so they can be caught together with
# a single ``except client_exceptions:`` clause: aiohttp client errors, the
# asyncio timeout error, and the socket name-resolution error.
# aiohttp.ClientError is the base class of the three aiohttp entries listed
# before it, so those entries are already covered by it.
client_exceptions = (
aiohttp.ClientResponseError,
aiohttp.ClientConnectionError,
aiohttp.ClientConnectorError,
aiohttp.ClientError,
asyncio.TimeoutError,
socket.gaierror,
)
2022-03-02 22:26:05 +01:00
def write_api(server, software, users, alive, api, soft_version):
    """Store one server's nodeinfo result and mark the server as checked.

    Three statements are executed on one connection and committed together:
    an insert into ``fediverse`` (ignored on conflict), an update of the
    ``fediverse`` row for ``server`` with the supplied values, and an update
    setting ``checked='t'`` for ``server`` in ``world``.

    Parameters map onto the ``fediverse`` columns: ``server``, ``software``,
    ``users``, ``alive``, ``api`` (column ``users_api``) and ``soft_version``
    (column ``version``); ``updated_at`` is the current local time.

    Errors raised while talking to the database are printed, not raised.
    """
    timestamp = datetime.now()
    db_name, db_user = get_db_config()

    # (statement, parameters) pairs, run in this order.
    statements = (
        (
            "INSERT INTO fediverse(server, updated_at, software, users, alive, users_api, version) VALUES(%s,%s,%s,%s,%s,%s,%s) ON CONFLICT DO NOTHING",
            (server, timestamp, software, users, alive, api, soft_version),
        ),
        (
            "UPDATE fediverse SET updated_at=(%s), software=(%s), users=(%s), alive=(%s), users_api=(%s), version=(%s) where server=(%s)",
            (timestamp, software, users, alive, api, soft_version, server),
        ),
        (
            "UPDATE world SET checked='t' where server=(%s)",
            (server,),
        ),
    )

    connection = None
    try:
        connection = psycopg2.connect(database=db_name, user=db_user, password="", host="/var/run/postgresql", port="5432")
        cursor = connection.cursor()
        print(f'Writing {server} nodeinfo data...')
        for sql, params in statements:
            cursor.execute(sql, params)
        connection.commit()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # Closing without a commit discards whatever was not committed.
        if connection is not None:
            connection.close()
2021-05-14 13:44:19 +02:00
# Servers reporting more users than this are skipped without being recorded.
_MAX_USERS = 1000000


def _parse_nodeinfo(nodeinfo, nodeinfo_json):
    """Return ``(software, version, users)`` read from a fetched document.

    ``nodeinfo`` is the path the document was fetched from; it selects which
    of the three layouts handled here is read. A document that does not have
    the expected keys raises KeyError, TypeError or AttributeError.
    """
    if nodeinfo == '/api/v1/instance?':
        return 'mastodon', nodeinfo_json['version'], nodeinfo_json['stats']['user_count']

    if nodeinfo == '/.well-known/x-nodeinfo2?':
        soft = nodeinfo_json['server']['software']
        soft_version = nodeinfo_json['server']['version']
    else:
        soft = nodeinfo_json['software']['name']
        soft_version = nodeinfo_json['software']['version']
    users = nodeinfo_json['usage']['users']['total']
    return soft.lower(), soft_version, users


async def getsoft(server):
    """Probe ``server`` over HTTPS and record its software and user count.

    Steps:
      1. Resolve the hostname; give up if it does not resolve.
      2. GET ``/.well-known/nodeinfo`` and take the ``href`` of the first
         entry in ``links`` (with the ``https://<server>`` prefix removed) as
         the nodeinfo path.
      3. GET that path and read software name, version and total users.
      4. Pass the result to ``write_api`` and print an "is alive" banner.

    Every failure is reported with ``print`` and ends the probe; nothing is
    raised for network, HTTP or document-shape problems. Returns ``None``.
    """
    try:
        socket.gethostbyname(server)
    except (socket.gaierror, UnicodeError) as g_error:
        # UnicodeError: the name could not be IDNA-encoded (e.g. a label that
        # is too long), so it cannot be a reachable host either.
        print(f'Server {server} error: {g_error}')
        return

    url = 'https://' + server
    user_agent = {'User-agent': "fediverse's stats (fediverse@mastodont.cat)"}
    timeout = aiohttp.ClientTimeout(total=3)

    # What decoding and indexing a non-JSON or unexpectedly shaped body raises.
    bad_document = (
        aiohttp.ContentTypeError,
        ValueError,
        KeyError,
        IndexError,
        TypeError,
        AttributeError,
    )

    nodeinfo_json = None
    try:
        async with aiohttp.ClientSession(timeout=timeout, headers=user_agent) as session:

            async with session.get(url + '/.well-known/nodeinfo') as response:
                if response.status != 200:
                    print(f'Server {server} not responding: {response.status}')
                    return
                try:
                    response_json = await response.json()
                    nodeinfo = response_json['links'][0]['href'].replace(f'https://{server}', '')
                except bad_document:
                    # Without a nodeinfo path there is nothing left to fetch.
                    print(f'Server {server} is dead')
                    return

            async with session.get(url + nodeinfo) as nodeinfo_response:
                if nodeinfo_response.status == 200:
                    try:
                        nodeinfo_json = await nodeinfo_response.json()
                    except bad_document:
                        # Leave nodeinfo_json as None; reported as dead below.
                        pass
                else:
                    print(f"Server {server}'s nodeinfo not responding: {nodeinfo_response.status}")

    except client_exceptions as error:
        # Connection, TLS, timeout and other client-side failures.
        print(f'Server {server} error: {error}')
        return

    if nodeinfo_json is not None:
        try:
            soft, soft_version, users = _parse_nodeinfo(nodeinfo, nodeinfo_json)
            too_many_users = users > _MAX_USERS
        except (KeyError, IndexError, TypeError, AttributeError):
            # Unexpected document shape: fall through to the "dead" report.
            pass
        else:
            if too_many_users:
                return
            # NOTE(review): x-nodeinfo2 documents are only recorded when the
            # software is socialhome; other software is skipped. Confirm this
            # restriction is still wanted.
            if nodeinfo == '/.well-known/x-nodeinfo2?' and soft != 'socialhome':
                return

            write_api(server, soft, users, True, nodeinfo, soft_version)

            # The Mastodon instance-API banner has never included the version.
            label = soft if nodeinfo == '/api/v1/instance?' else f'{soft} {soft_version}'
            print('*********************************************************************')
            print(f'Server {server} ({label}) is alive!')
            print('*********************************************************************')
            return

    print(f'Server {server} is dead')
def getserver(server, *args):
    """Validate a hostname and, if it looks plausible, probe it with ``getsoft``.

    ``server`` is normally a hostname string. When extra positional
    arguments are supplied, ``server`` is first normalised (trailing dots
    removed, lower-cased); in that case it may also be a sequence whose first
    item is the hostname. Without extra arguments the name is used as given.

    Names without a dot, or containing ``@``, ``/`` or ``:``, are ignored.
    Returns ``None``; any error raised by the probe is printed, not raised.
    """
    if args:
        # Only index into real sequences: indexing a plain string would yield
        # its first character instead of the hostname.
        if not isinstance(server, str):
            server = server[0]
        server = server.rstrip('.').lower()

    # A usable name needs a dot and must be a bare host: no user part, path or port.
    if '.' not in server or any(char in server for char in '@/:'):
        return

    try:
        # asyncio.run creates and closes a fresh event loop for this probe.
        asyncio.run(getsoft(server))
    except Exception as error:
        print(f'Server {server} error: {error}')
2022-03-02 22:26:05 +01:00
def get_world_servers():
    """Return the servers in ``world`` whose ``checked`` flag is ``'f'``.

    Database name and user are read with ``get_db_config`` rather than from
    module-level globals, so the function also works when it is called
    outside the script's main section. The number of servers found is
    printed. Database errors are printed and an empty list is returned.
    """
    world_servers = []

    fediverse_db, fediverse_db_user = get_db_config()

    conn = None
    try:
        conn = psycopg2.connect(database=fediverse_db, user=fediverse_db_user, password="", host="/var/run/postgresql", port="5432")
        # The cursor is closed when the with-block ends, even on error.
        with conn.cursor() as cur:
            cur.execute("select server from world where checked='f'")
            world_servers = [row[0] for row in cur.fetchall()]
        print("Remaining servers: " + str(len(world_servers)))
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()

    return world_servers
2021-05-14 13:44:19 +02:00
def get_parameter(parameter, file_path):
    """Return the value of ``parameter`` from a ``key: value`` text file.

    Each line is split at its first colon; the first line whose key (with
    surrounding whitespace removed) equals ``parameter`` exactly supplies the
    value, which is returned stripped. Matching the whole key means that
    asking for ``fediverse_db`` can no longer pick up a
    ``fediverse_db_user: ...`` line that happens to come first.

    If the file does not exist, or no line defines ``parameter``, a message is
    printed and the process exits via ``sys.exit``.
    """
    # NOTE(review): both error paths exit with status 0, which callers cannot
    # tell apart from success; kept unchanged for compatibility.
    if not os.path.isfile(file_path):
        print("File %s not found, exiting." % file_path)
        sys.exit(0)

    with open(file_path) as f:
        for line in f:
            key, separator, value = line.partition(":")
            # Lines without a colon define nothing.
            if separator and key.strip() == parameter:
                return value.strip()

    print(file_path + " Missing parameter %s " % parameter)
    sys.exit(0)
2022-03-02 22:26:05 +01:00
def usage():
    """Print one usage line per supported command-line mode."""
    script = sys.argv[0]
    modes = (
        ('--multi', '(multiprocessing, fast)'),
        ('--mono', '(one process, slow)'),
    )
    for flag, note in modes:
        print(f'usage: python {script} {flag} {note}')
2021-05-14 10:39:26 +02:00
def get_config():
    """Return the ``mastodon_hostname`` value read from ``config/config.txt``."""
    return get_parameter("mastodon_hostname", "config/config.txt")
2021-05-14 10:39:26 +02:00
def get_db_config():
    """Return ``(fediverse_db, fediverse_db_user)`` read from ``config/db_config.txt``."""
    path = "config/db_config.txt"
    # Looked up in this order: database name first, then database user.
    database = get_parameter("fediverse_db", path)
    user = get_parameter("fediverse_db_user", path)
    return (database, user)
2021-05-14 10:39:26 +02:00
###############################################################################
# main
if __name__ == '__main__':

    # Exactly one mode flag is expected; anything else gets the usage text
    # instead of silently doing nothing.
    mode = sys.argv[1] if len(sys.argv) == 2 else None

    if mode not in ('--multi', '--mono'):

        usage()

    else:

        # Read both config files up front: a missing file or parameter stops
        # the run here, before any server is probed.
        mastodon_hostname = get_config()

        # Bound at module level so code that reads these names as globals
        # keeps working.
        fediverse_db, fediverse_db_user = get_db_config()

        world_servers = get_world_servers()

        if mode == '--multi':

            # One getserver(server) call per server, spread over 32 freshly
            # spawned worker processes.
            with get_context("spawn").Pool(processes=32) as pool:

                pool.map(getserver, world_servers)

                pool.close()

                pool.join()

        else:

            for server in world_servers:

                getserver(server)

        print('Done.')