From d4b37995fea40320c1971ea8bd747fc9ece9c368 Mon Sep 17 00:00:00 2001 From: Lorenz Diener Date: Fri, 16 Jun 2017 01:23:19 +0200 Subject: [PATCH] Pagination --- docs/index.rst | 37 +++++++++++++-- mastodon/Mastodon.py | 104 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 5 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index de14554..48cea67 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -73,6 +73,28 @@ a loop without ever sleeping at all yourself. It is for applications that would just pretend there is no such thing as a rate limit and are fine with sometimes not being very interactive. +A note about pagination +----------------------- +Many of Mastodons API endpoints are paginated. What this means is that if you request +data from them, you might not get all the data at once - instead, you might only get the +first few results. + +All endpoints that are paginated have three parameters: since_id, max_id and limit. +since_id allows you to specify the smallest id you want in the returned data. max_id, +similarly, allows you to specify the largest. By specifying either one (generally, +only one, not both) of them you can go through pages forwards and backwards. + +limit allows you to specify how many results you would like returned. Note that an +instance may choose to return less results than you requested. + +The responses returned by paginated endpoints contain a "link" header that specifies +which parameters to use to get the next and previous pages. Mastodon.py parses these +and stores them (if present) in the first (for the previous page) and last (for the +next page) item of the returned list as _pagination_prev and _pagination_next. + +There are convenience functions available for fetching the previous and next page of +a paginated request as well as for fetching all pages starting from a first page. + A note about IDs ---------------- Mastodons API uses IDs in several places: User IDs, Toot IDs, ... 
@@ -257,8 +279,9 @@ you can simply pass them to the constructor of the class, too! Note that while it is perfectly reasonable to log back in whenever your app starts, registering a new application on every startup is not, so don't do that - instead, register an application -once, and then persist your client id and secret. Convenience -methods for this are provided. +once, and then persist your client id and secret. A convenient method +for this is provided by the functions dealing with registering the app, +logging in, and the Mastodon class's constructor. To talk to an instance different from the flagship instance, specify the api_base_url (usually, just the URL of the instance, i.e. @@ -405,12 +428,20 @@ Writing data: Reports Writing data: Domain blocks --------------------------- -These methods allow you to block and unblock all statuses from a domain +These functions allow you to block and unblock all statuses from a domain for the logged-in user. .. automethod:: Mastodon.domain_block .. automethod:: Mastodon.domain_unblock +Pagination +---------- +These functions allow for convenient retrieval of paginated data. + +.. automethod:: Mastodon.fetch_next +.. automethod:: Mastodon.fetch_previous +.. automethod:: Mastodon.fetch_remaining + Streaming --------- These functions allow access to the streaming API. diff --git a/mastodon/Mastodon.py b/mastodon/Mastodon.py index be6ea3f..26294f7 100644 --- a/mastodon/Mastodon.py +++ b/mastodon/Mastodon.py @@ -14,10 +14,12 @@ import requests from requests.models import urlencode import dateutil import dateutil.parser +import re +import copy class Mastodon: """ - Super basic but thorough and easy to use mastodon.social + Super basic but thorough and easy to use Mastodon API wrapper in Python. 
If anything is unclear, check the official API docs at @@ -743,6 +745,76 @@ class Mastodon: params = self.__generate_params(locals()) return self.__api_request('DELETE', '/api/v1/domain_blocks', params) + ### + # Pagination + ### + def fetch_next(self, previous_page): + """ + Fetches the next page of results of a paginated request. Pass in the + previous page in its entirety, or the pagination information dict + returned as a part of that page's last status ('_pagination_next'). + + Returns the next page or None if no further data is available. + """ + if isinstance(previous_page, list): + if '_pagination_next' in previous_page[-1]: + params = previous_page[-1]['_pagination_next'] + else: + return None + else: + params = previous_page + + method = params['_pagination_method'] + del params['_pagination_method'] + + endpoint = params['_pagination_endpoint'] + del params['_pagination_endpoint'] + + return self.__api_request(method, endpoint, params) + + def fetch_previous(self, next_page): + """ + Fetches the previous page of results of a paginated request. Pass in the + next page in its entirety, or the pagination information dict + returned as a part of that page's first status ('_pagination_prev'). + + Returns the previous page or None if no further data is available. + """ + if isinstance(next_page, list): + if '_pagination_prev' in next_page[0]: + params = next_page[0]['_pagination_prev'] + else: + return None + else: + params = next_page + + method = params['_pagination_method'] + del params['_pagination_method'] + + endpoint = params['_pagination_endpoint'] + del params['_pagination_endpoint'] + + return self.__api_request(method, endpoint, params) + + def fetch_remaining(self, first_page): + """ + Fetches all the remaining pages of a paginated request starting from a + first page and returns the entire set of results (including the first page + that was passed in) as a big list. 
+ + Be careful, as this might generate a lot of requests, depending on what you are + fetching, and might cause you to run into rate limits very quickly. + """ + first_page = copy.deepcopy(first_page) + + all_pages = [] + current_page = first_page + while current_page is not None: + all_pages.extend(current_page) + current_page = self.fetch_next(current_page) + + return all_pages + ### # Streaming ### @@ -786,7 +858,7 @@ class Mastodon: incoming events. """ return self.__stream('/api/v1/streaming/hashtag', listener, params={'tag': tag}) - + ### # Internal helpers, dragons probably ### @@ -884,6 +956,34 @@ class Mastodon: except: raise MastodonAPIError("Could not parse response as JSON, response code was %s, bad json content was '%s'" % (response_object.status_code, response_object.content)) + # Parse link headers + if isinstance(response, list) and 'Link' in response_object.headers: + tmp_urls = requests.utils.parse_header_links(response_object.headers['Link'].rstrip('>').replace('>,<', ',<')) + for url in tmp_urls: + if url['rel'] == 'next': + # Be paranoid and extract max_id specifically + next_url = url['url'] + matchgroups = re.search(r"max_id=([0-9]*)", next_url) + + if matchgroups: + next_params = copy.deepcopy(params) + next_params['_pagination_method'] = method + next_params['_pagination_endpoint'] = endpoint + next_params['max_id'] = int(matchgroups.group(1)) + response[-1]['_pagination_next'] = next_params + + if url['rel'] == 'prev': + # Be paranoid and extract since_id specifically + prev_url = url['url'] + matchgroups = re.search(r"since_id=([0-9]*)", prev_url) + + if matchgroups: + prev_params = copy.deepcopy(params) + prev_params['_pagination_method'] = method + prev_params['_pagination_endpoint'] = endpoint + prev_params['since_id'] = int(matchgroups.group(1)) + response[0]['_pagination_prev'] = prev_params + # Handle rate limiting if 'X-RateLimit-Remaining' in response_object.headers and do_ratelimiting: self.ratelimit_remaining = 
int(response_object.headers['X-RateLimit-Remaining'])