From d4b37995fea40320c1971ea8bd747fc9ece9c368 Mon Sep 17 00:00:00 2001 From: Lorenz Diener Date: Fri, 16 Jun 2017 01:23:19 +0200 Subject: [PATCH] Pagination --- docs/index.rst | 37 +++++++++++++-- mastodon/Mastodon.py | 104 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 5 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index de14554..48cea67 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -73,6 +73,28 @@ a loop without ever sleeping at all yourself. It is for applications that would just pretend there is no such thing as a rate limit and are fine with sometimes not being very interactive. +A note about pagination +----------------------- +Many of Mastodons API endpoints are paginated. What this means is that if you request +data from them, you might not get all the data at once - instead, you might only get the +first few results. + +All endpoints that are paginated have three parameters: since_id, max_id and limit. +since_id allows you to specify the smallest id you want in the returned data. max_id, +similarly, allows you to specify the largest. By specifying either one (generally, +only one, not both) of them you can go through pages forwards and backwards. + +limit allows you to specify how many results you would like returned. Note that an +instance may choose to return less results than you requested. + +The responses returned by paginated endpoints contain a "link" header that specifies +which parameters to use to get the next and previous pages. Mastodon.py parses these +and stores them (if present) in the first (for the previous page) and last (for the +next page) item of the returned list as _pagination_prev and _pagination_next. + +There are convenience functions available for fetching the previous and next page of +a paginated request as well as for fetching all pages starting from a first page. + A note about IDs ---------------- Mastodons API uses IDs in several places: User IDs, Toot IDs, ... 
@@ -257,8 +279,9 @@ you can simply pass them to the constructor of the class, too! Note that while it is perfectly reasonable to log back in whenever your app starts, registering a new application on every startup is not, so don't do that - instead, register an application -once, and then persist your client id and secret. Convenience -methods for this are provided. +once, and then persist your client id and secret. A convenient method +for this is provided by the functions dealing with registering the app, +logging in, and the Mastodon class's constructor. To talk to an instance different from the flagship instance, specify the api_base_url (usually, just the URL of the instance, i.e. @@ -405,12 +428,20 @@ Writing data: Reports Writing data: Domain blocks --------------------------- -These methods allow you to block and unblock all statuses from a domain +These functions allow you to block and unblock all statuses from a domain for the logged-in user. .. automethod:: Mastodon.domain_block .. automethod:: Mastodon.domain_unblock +Pagination +---------- +These functions allow for convenient retrieval of paginated data. + +.. automethod:: Mastodon.fetch_next +.. automethod:: Mastodon.fetch_previous +.. automethod:: Mastodon.fetch_remaining + Streaming --------- These functions allow access to the streaming API. diff --git a/mastodon/Mastodon.py b/mastodon/Mastodon.py index be6ea3f..26294f7 100644 --- a/mastodon/Mastodon.py +++ b/mastodon/Mastodon.py @@ -14,10 +14,12 @@ import requests from requests.models import urlencode import dateutil import dateutil.parser +import re +import copy class Mastodon: """ - Super basic but thorough and easy to use mastodon.social + Super basic but thorough and easy to use Mastodon API wrapper in Python. 
If anything is unclear, check the official API docs at @@ -743,6 +745,76 @@ class Mastodon: params = self.__generate_params(locals()) return self.__api_request('DELETE', '/api/v1/domain_blocks', params) + ### + # Pagination + ### + def fetch_next(self, previous_page): + """ + Fetches the next page of results of a paginated request. Pass in the + previous page in its entirety, or the pagination information dict + returned as a part of that page's last status ('_pagination_next'). + + Returns the next page or None if no further data is available. + """ + if isinstance(previous_page, list): + if '_pagination_next' in previous_page[-1]: + params = previous_page[-1]['_pagination_next'] + else: + return None + else: + params = previous_page + + method = params['_pagination_method'] + del params['_pagination_method'] + + endpoint = params['_pagination_endpoint'] + del params['_pagination_endpoint'] + + return self.__api_request(method, endpoint, params) + + def fetch_previous(self, next_page): + """ + Fetches the previous page of results of a paginated request. Pass in the + next page in its entirety, or the pagination information dict + returned as a part of that page's first status ('_pagination_prev'). + + Returns the previous page or None if no further data is available. + """ + if isinstance(next_page, list): + if '_pagination_prev' in next_page[0]: + params = next_page[0]['_pagination_prev'] + else: + return None + else: + params = next_page + + method = params['_pagination_method'] + del params['_pagination_method'] + + endpoint = params['_pagination_endpoint'] + del params['_pagination_endpoint'] + + return self.__api_request(method, endpoint, params) + + def fetch_remaining(self, first_page): + """ + Fetches all the remaining pages of a paginated request starting from a + first page and returns the entire set of results (including the first page + that was passed in) as a big list. 
+ + Be careful, as this might generate a lot of requests, depending on what you are + fetching, and might cause you to run into rate limits very quickly. + """ + first_page = copy.deepcopy(first_page) + + all_pages = [] + current_page = first_page + while current_page is not None: + all_pages.extend(current_page) + current_page = self.fetch_next(current_page) + + return all_pages + ### # Streaming ### @@ -786,7 +858,7 @@ class Mastodon: incoming events. """ return self.__stream('/api/v1/streaming/hashtag', listener, params={'tag': tag}) - + ### # Internal helpers, dragons probably ### @@ -884,6 +956,34 @@ class Mastodon: except: raise MastodonAPIError("Could not parse response as JSON, response code was %s, bad json content was '%s'" % (response_object.status_code, response_object.content)) + # Parse link headers + if isinstance(response, list) and 'Link' in response_object.headers: + tmp_urls = requests.utils.parse_header_links(response_object.headers['Link'].rstrip('>').replace('>,<', ',<')) + for url in tmp_urls: + if url['rel'] == 'next': + # Be paranoid and extract max_id specifically + next_url = url['url'] + matchgroups = re.search(r"max_id=([0-9]*)", next_url) + + if matchgroups: + next_params = copy.deepcopy(params) + next_params['_pagination_method'] = method + next_params['_pagination_endpoint'] = endpoint + next_params['max_id'] = int(matchgroups.group(1)) + response[-1]['_pagination_next'] = next_params + + if url['rel'] == 'prev': + # Be paranoid and extract since_id specifically + prev_url = url['url'] + matchgroups = re.search(r"since_id=([0-9]*)", prev_url) + + if matchgroups: + prev_params = copy.deepcopy(params) + prev_params['_pagination_method'] = method + prev_params['_pagination_endpoint'] = endpoint + prev_params['since_id'] = int(matchgroups.group(1)) + response[0]['_pagination_prev'] = prev_params + # Handle rate limiting if 'X-RateLimit-Remaining' in response_object.headers and do_ratelimiting: self.ratelimit_remaining = 
int(response_object.headers['X-RateLimit-Remaining'])