Automatically follow http->https redirects
author Magnus Hagander <magnus@hagander.net>
Mon, 19 Mar 2018 11:26:04 +0000 (12:26 +0100)
committer Magnus Hagander <magnus@hagander.net>
Mon, 19 Mar 2018 11:26:04 +0000 (12:26 +0100)
We only follow the redirect if the URL is *exactly* the same except it
has https instead of http in it. But this is a very common scenario these
days, so let's simplify that one.

In particular, it makes no sense to re-moderate a blog after that change
has been made.

hamnadmin/hamnadmin/register/forms.py
hamnadmin/hamnadmin/register/management/commands/aggregate_feeds.py
hamnadmin/hamnadmin/util/aggregate.py

index e72f710ddb14e682cb2cf5905b8e0932413965dc..e45f7d9929fb01cd38f28e1e625346c753065d80 100644 (file)
@@ -5,7 +5,7 @@ from django.conf import settings
 
 from models import Blog
 
-from hamnadmin.util.aggregate import FeedFetcher
+from hamnadmin.util.aggregate import FeedFetcher, ParserGotRedirect
 
 import urllib
 import requests
@@ -41,6 +41,8 @@ class BlogEditForm(forms.ModelForm):
                fetcher = FeedFetcher(feedobj, _trace, False)
                try:
                        entries = list(fetcher.parse())
+               except ParserGotRedirect:
+                       raise forms.ValidationError("This URL returns a permanent redirect")
                except Exception, e:
                        raise forms.ValidationError("Failed to retreive and parse feed: %s" % e)
                if len(entries) == 0:
index e9256146e57f6713da589c13a9c1e7ef8048d6e1..cc50e663eddc4166f544ac639495f84646815b18 100644 (file)
@@ -5,10 +5,11 @@ import gevent
 
 from django.core.management.base import BaseCommand, CommandError
 from django.db import transaction
+from django.db.models import Q
 from django.conf import settings
 
 from hamnadmin.register.models import Blog, Post, AggregatorLog
-from hamnadmin.util.aggregate import FeedFetcher
+from hamnadmin.util.aggregate import FeedFetcher, ParserGotRedirect
 from hamnadmin.mailqueue.util import send_simple_mail
 from hamnadmin.util.varnish import purge_root_and_feeds
 
@@ -60,7 +61,39 @@ class Command(BaseCommand):
                try:
                        with transaction.atomic():
                                for feed, results in pr.get():
-                                       if isinstance(results, Exception):
+                                       if isinstance(results, ParserGotRedirect):
+                                               # Received a redirect. If this is a redirect for exactly the same URL just
+                                               # from http to https, special case this and allow it. For any other redirect,
+                                               # we don't follow it since it might no longer be a properly filtered feed
+                                               # for example.
+                                               if results.url == feed.feedurl:
+                                                       # Redirect to itself! Should never happen, of course.
+                                                       AggregatorLog(feed=feed, success=False,
+                                                                                 info="Feed returned redirect loop to itself!").save()
+                                               elif results.url == feed.feedurl.replace('http://', 'https://'):
+                                                       # OK, update it!
+                                                       AggregatorLog(feed=feed, success=True,
+                                                                                 info="Feed returned redirect to https, updating registration").save()
+                                                       send_simple_mail(settings.EMAIL_SENDER,
+                                                                                        feed.user.email,
+                                                                                        "Your blog at Planet PostgreSQL redirected",
+                                                                                        u"The blog aggregator at Planet PostgreSQL has picked up a redirect for your blog.\nOld URL: {0}\nNew URL: {1}\n\nThe database has been updated, and new entries will be fetched from the secure URL in the future.\n".format(feed.feedurl, results.url),
+                                                                                        sendername="Planet PostgreSQL",
+                                                                                        receivername=u"{0} {1}".format(feed.user.first_name, feed.user.last_name),
+                                                                                        )
+                                                       send_simple_mail(settings.EMAIL_SENDER,
+                                                                                        settings.NOTIFICATION_RECEIVER,
+                                                                                        "Blog redirect detected on Planet PostgreSQL",
+                                                                                        u"The blog at {0} by {1}\nis returning a redirect to a https version of itself.\n\nThe database has automatically been updated, and will start fetching using https in the future,\n\n".format(feed.feedurl, feed.user),
+                                                                                        sendername="Planet PostgreSQL",
+                                                                                        receivername="Planet PostgreSQL Moderators",
+                                                       )
+                                                       feed.feedurl = results.url
+                                                       feed.save()
+                                               else:
+                                                       AggregatorLog(feed=feed, success=False,
+                                                                                 info="Feed returned redirect (http 301)").save()
+                                       elif isinstance(results, Exception):
                                                AggregatorLog(feed=feed,
                                                                          success=False,
                                                                          info=results).save()
@@ -162,6 +195,8 @@ class Command(BaseCommand):
                        self.trace("Fetching %s since %s" % (fetcher.feed.feedurl, since))
                try:
                        entries = list(fetcher.parse(since))
+               except ParserGotRedirect, e:
+                       return (fetcher.feed, e)
                except Exception, e:
                        self.stderr.write("Failed to fetch '%s': %s" % (fetcher.feed.feedurl, e))
                        return (fetcher.feed, e)
index 51950e6bb9d5c0b4073d94b251c278865e3f76d1..81f8c6918098a3b865f6ab55b4db88a0242012ba 100644 (file)
@@ -7,6 +7,11 @@ import feedparser
 
 from hamnadmin.register.models import Post
 
+class ParserGotRedirect(Exception):
+       """
+       Raised when fetching a feed returns a permanent redirect (http 301).
+
+       The redirect target is available in the url attribute, so the caller
+       can decide whether to follow it or to report it.
+       """
+       def __init__(self, url):
+               self.url = url
+               # Name this class (not Exception) in super(), so Exception's own
+               # initializer is not skipped. Pass the URL along so that args
+               # and str() of the exception are not empty, and so that the
+               # exception can be copied/pickled (which re-calls cls(*args)).
+               super(ParserGotRedirect, self).__init__(url)
+
 class FeedFetcher(object):
        def __init__(self, feed, tracefunc=None, update=True):
                self.feed = feed
@@ -39,8 +44,12 @@ class FeedFetcher(object):
                        # Not modified
                        return
 
+               if parser.status == 301 and hasattr(parser, 'href'):
+                       # Permanent redirect. Bubble this up with an exception and let the caller
+                       # handle it.
+                       raise ParserGotRedirect(parser.href)
+
                if parser.status != 200:
-                       # XXX: follow redirect?
                        raise Exception('Feed returned status %s' % parser.status)
 
                self._trace("Fetched %s, status %s" % (self.feed.feedurl, parser.status))