import time
class AuthBackend(ModelBackend):
    """Fake authentication backend.

    All real logins go through the community-auth redirect flow, so any
    direct (username/password) authentication attempt is a bug and is
    refused unconditionally.
    """
    def authenticate(self, username=None, password=None):
        # Fail hard: nothing in this app should ever call this.
        raise Exception("Direct authentication not supported")
####
# Handle login requests by sending them off to the main site
def login(request):
    """Redirect the browser to the central community-auth login page.

    If PGAUTH_REDIRECT is not configured, fall back to Django's own login
    view so local installs still work.  If a 'next' URL is present, it is
    wrapped with a timestamp nonce and AES-CBC encrypted under a key
    derived from SECRET_KEY, so auth_receive() can recover it unmodified.
    """
    if not hasattr(settings, 'PGAUTH_REDIRECT'):
        # No pgauth installed, so allow local installs.
        from django.contrib.auth.views import login
        return login(request, template_name='admin.html')

    if 'next' in request.GET:
        # Put together an url-encoded dict of parameters we're getting back,
        # including a small nonce at the beginning to make sure it doesn't
        # encrypt the same way every time.
        s = "t=%s&%s" % (int(time.time()), urlencode({'r': request.GET['next']}))
        # Now encrypt it
        r = Random.new()
        iv = r.read(16)
        encryptor = AES.new(SHA.new(settings.SECRET_KEY.encode('ascii')).digest()[:16], AES.MODE_CBC, iv)
        # Pad with spaces to a multiple of 16 bytes (auth_receive strips
        # them with rstrip(b' ')), and encode to bytes -- the py3 AES API
        # rejects str plaintext, which this used to pass.
        cipher = encryptor.encrypt((s + ' ' * (16 - (len(s) % 16))).encode('utf8'))

        return HttpResponseRedirect("%s?d=%s$%s" % (
            settings.PGAUTH_REDIRECT,
            base64.b64encode(iv, b"-_").decode('utf8'),
            base64.b64encode(cipher, b"-_").decode('utf8'),
        ))
    else:
        return HttpResponseRedirect(settings.PGAUTH_REDIRECT)
# Handle logout requests by logging out of this site and then
# redirecting to log out from the main site as well.
def logout(request):
    """Terminate the local session, then bounce to the central logout."""
    if request.user.is_authenticated():
        # End the local Django session first.
        django_logout(request)
    # Always send the browser on to the central logout page.
    return HttpResponseRedirect(settings.PGAUTH_REDIRECT + "logout/")
# Receive an authentication response from the main website and try
# to log the user in.
def auth_receive(request):
    """Decrypt and validate an authentication callback from the central
    auth server, create or update the matching local User, log it in,
    and redirect.

    Expects GET parameters 'i' (IV) and 'd' (ciphertext), both
    urlsafe-base64 encoded, or 's=logout' for logout round-trips.
    Returns 400 responses for malformed or stale payloads.
    """
    if 's' in request.GET and request.GET['s'] == "logout":
        # This was a logout request
        return HttpResponseRedirect('/')

    if 'i' not in request.GET:
        return HttpResponse("Missing IV in url!", status=400)
    if 'd' not in request.GET:
        return HttpResponse("Missing data in url!", status=400)

    # Set up an AES object and decrypt the data we received
    decryptor = AES.new(base64.b64decode(settings.PGAUTH_KEY),
                        AES.MODE_CBC,
                        base64.b64decode(str(request.GET['i']), "-_"))
    s = decryptor.decrypt(base64.b64decode(str(request.GET['d']), "-_")).rstrip(b' ').decode('utf8')

    # Now un-urlencode it
    try:
        data = parse_qs(s, strict_parsing=True)
    except ValueError:
        return HttpResponse("Invalid encrypted data received.", status=400)

    # Check the timestamp in the authentication
    if (int(data['t'][0]) < time.time() - 10):
        return HttpResponse("Authentication token too old.", status=400)

    # Update the user record (if any)
    try:
        user = User.objects.get(username=data['u'][0])
        # User found, let's see if any important fields have changed
        changed = False
        if user.first_name != data['f'][0]:
            user.first_name = data['f'][0]
            changed = True
        if user.last_name != data['l'][0]:
            user.last_name = data['l'][0]
            changed = True
        if user.email != data['e'][0]:
            user.email = data['e'][0]
            changed = True
        if changed:
            user.save()
    except User.DoesNotExist:
        # User not found, create it!

        # NOTE! We have some legacy users where there is a user in
        # the database with a different userid. Instead of trying to
        # somehow fix that live, give a proper error message and
        # have somebody look at it manually.
        if User.objects.filter(email=data['e'][0]).exists():
            return HttpResponse("""A user with email %s already exists, but with
a different username than %s.
This is almost certainly caused by some legacy data in our database.
We apologize for the inconvenience.
""" % (data['e'][0], data['u'][0]), content_type='text/plain')

        if hasattr(settings, 'PGAUTH_CREATEUSER_CALLBACK'):
            res = getattr(settings, 'PGAUTH_CREATEUSER_CALLBACK')(
                data['u'][0],
                data['e'][0],
                # Bug fix: this used to pass ['f'][0] -- the literal
                # string 'f' -- instead of the user's first name.
                data['f'][0],
                data['l'][0],
            )
            # If anything is returned, we'll return that as our result.
            # If None is returned, it means go ahead and create the user.
            if res:
                return res

        user = User(username=data['u'][0],
                    first_name=data['f'][0],
                    last_name=data['l'][0],
                    email=data['e'][0],
                    password='setbypluginnotasha1',
                    )
        user.save()

    # Ok, we have a proper user record. Now tell django that
    # we're authenticated so it persists it in the session. Before
    # we do that, we have to annotate it with the backend information.
    user.backend = "%s.%s" % (AuthBackend.__module__, AuthBackend.__name__)
    django_login(request, user)

    # Finally, check if we have a data package that tells us where to
    # redirect the user.
    if 'd' in data:
        (ivs, datas) = data['d'][0].split('$')
        decryptor = AES.new(SHA.new(settings.SECRET_KEY.encode('ascii')).digest()[:16],
                            AES.MODE_CBC,
                            base64.b64decode(ivs, b"-_"))
        s = decryptor.decrypt(base64.b64decode(datas, "-_")).rstrip(b' ').decode('utf8')
        try:
            rdata = parse_qs(s, strict_parsing=True)
        except ValueError:
            return HttpResponse("Invalid encrypted data received.", status=400)
        if 'r' in rdata:
            # Redirect address
            return HttpResponseRedirect(rdata['r'][0])
    # No redirect specified, see if we have it in our settings
    if hasattr(settings, 'PGAUTH_REDIRECT_SUCCESS'):
        return HttpResponseRedirect(settings.PGAUTH_REDIRECT_SUCCESS)
    return HttpResponse("Authentication successful, but don't know where to redirect!", status=500)
# Perform a search in the central system. Note that the results are returned as an
# Unlike the authentication, searching does not involve the browser - we just make
# a direct http call.
def user_search(searchterm=None, userid=None):
    """Query the central auth server for users, either by free-text
    *searchterm* or by exact *userid*, and return the decrypted JSON
    payload (a list of dicts with keys u/f/l/e, per user_import)."""
    # If upsteam isn't responding quickly, it's not going to respond at all, and
    # 10 seconds is already quite long.
    socket.setdefaulttimeout(10)
    if userid:
        q = {'u': userid}
    else:
        q = {'s': searchterm}

    u = urllib.request.urlopen('%ssearch/?%s' % (
        settings.PGAUTH_REDIRECT,
        urlencode(q),
    ))
    # Bug fix: read() returns bytes, so the separator must be bytes too --
    # split('&') raised TypeError on Python 3.
    (ivs, datas) = u.read().split(b'&')
    u.close()

    # Decryption time
    decryptor = AES.new(base64.b64decode(settings.PGAUTH_KEY),
                        AES.MODE_CBC,
                        base64.b64decode(ivs, "-_"))
    # Bug fix: decrypt() also returns bytes; strip the space padding with
    # a bytes argument (rstrip(' ') raised TypeError) and decode before
    # parsing the JSON.
    s = decryptor.decrypt(base64.b64decode(datas, "-_")).rstrip(b' ').decode('utf8')
    j = json.loads(s)

    return j
# Import a user into the local authentication system. Will initially
# make a search for it, and if anything other than one entry is returned
# The call to this function should normally be wrapped in a transaction,
# and this function itself will make no attempt to do anything about that.
def user_import(uid):
    """Fetch user *uid* from the central system and create a local User.

    Raises an Exception when the upstream lookup does not return exactly
    one entry, or when the username already exists locally.
    """
    matches = user_search(userid=uid)
    if len(matches) != 1:
        raise Exception("Internal error, duplicate or no user found")

    entry = matches[0]

    if User.objects.filter(username=entry['u']).exists():
        raise Exception("User already exists")

    User(username=entry['u'],
         first_name=entry['f'],
         last_name=entry['l'],
         email=entry['e'],
         password='setbypluginnotsha1',
         ).save()
@cache(hours=4)
def listinfo(request):
    """JSON dump of all mailing lists, for whitelisted API clients on
    public archives only."""
    if not settings.PUBLIC_ARCHIVES:
        return HttpResponseForbidden('No API access on private archives for now')

    if request.META['REMOTE_ADDR'] not in settings.API_CLIENTS:
        return HttpResponseForbidden('Invalid host')

    resp = HttpResponse(content_type='application/json')
    payload = [{
        'name': lst.listname,
        'shortdesc': lst.shortdesc,
        'description': lst.description,
        'active': lst.active,
        'group': lst.group.groupname,
    } for lst in List.objects.select_related('group').all()]
    json.dump(payload, resp)
    return resp
@cache(hours=4)
def latest(request, listname):
    """JSON list of the latest messages on *listname* ('*' for all lists).

    GET parameters: n (count, 1-100, default 50), a=1 (only messages with
    attachments), s (full text search term).
    """
    if not settings.PUBLIC_ARCHIVES:
        return HttpResponseForbidden('No API access on private archives for now')

    if request.META['REMOTE_ADDR'] not in settings.API_CLIENTS:
        return HttpResponseForbidden('Invalid host')

    # Return the latest <n> messages on this list.
    # If <n> is not specified, return 50. Max value for <n> is 100.
    if 'n' in request.GET:
        try:
            limit = int(request.GET['n'])
        except ValueError:
            # Non-numeric: fall through to the default below.
            # (Was a bare except, which also hid unrelated errors.)
            limit = 0
    else:
        limit = 50
    if limit <= 0 or limit > 100:
        limit = 50

    extrawhere = []
    extraparams = []

    # Return only messages that have attachments?
    if 'a' in request.GET:
        if request.GET['a'] == '1':
            extrawhere.append("has_attachment")

    # Restrict by full text search
    if 's' in request.GET and request.GET['s']:
        extrawhere.append("fti @@ plainto_tsquery('public.pg', %s)")
        extraparams.append(request.GET['s'])

    if listname != '*':
        list = get_object_or_404(List, listname=listname)
        # Parameterized instead of string-interpolated into the SQL.
        extrawhere.append("threadid IN (SELECT threadid FROM list_threads WHERE listid=%s)")
        extraparams.append(list.listid)
    else:
        list = None
        # Bug fix: this branch used to reset extrawhere to '', silently
        # dropping the attachment/search filters for the global view.

    mlist = Message.objects.defer('bodytxt', 'cc', 'to').select_related().extra(where=extrawhere, params=extraparams).order_by('-date')[:limit]
    allyearmonths = set([(m.date.year, m.date.month) for m in mlist])

    resp = HttpResponse(content_type='application/json')
    json.dump([
        {'msgid': m.messageid,
         'date': m.date.isoformat(),
         'from': m.mailfrom,
         'subj': m.subject, }
        for m in mlist], resp)

    # Make sure this expires from the varnish cache when new entries show
    # up in this month.
    # XXX: need to deal with the global view, but for now API callers come in directly
    if list:
        resp['X-pglm'] = ':%s:' % (':'.join(['%s/%s/%s' % (list.listid, year, month) for year, month in allyearmonths]))
    return resp
@cache(hours=4)
def thread(request, msgid):
    """JSON metadata for every message in the thread containing *msgid*,
    including basic attachment info. Sets X-pgthread for cache purging."""
    if not settings.PUBLIC_ARCHIVES:
        return HttpResponseForbidden('No API access on private archives for now')

    if request.META['REMOTE_ADDR'] not in settings.API_CLIENTS:
        return HttpResponseForbidden('Invalid host')

    # Return metadata about a single thread. A list of all the emails
    # that are in the thread with their basic attributes are included.
    msg = get_object_or_404(Message, messageid=msgid)
    mlist = Message.objects.defer('bodytxt', 'cc', 'to').filter(threadid=msg.threadid)

    resp = HttpResponse(content_type='application/json')
    json.dump([
        {'msgid': m.messageid,
         'date': m.date.isoformat(),
         'from': m.mailfrom,
         'subj': m.subject,
         'atts': [{'id': a.id, 'name': a.filename} for a in m.attachment_set.all()],
         }
        for m in mlist], resp)
    # Bug fix: this used the comprehension variable 'm' after the list
    # comprehension, which does not leak in Python 3 and raised NameError.
    # All messages in mlist share msg's threadid, so use that.
    resp['X-pgthread'] = msg.threadid
    return resp
def thread_subscribe(request, msgid):
    """PUT endpoint: subscribe an API client (via X-APIKEY header) to the
    thread containing *msgid*. Returns 201 when newly created, 200 when
    the subscription already existed."""
    if not settings.PUBLIC_ARCHIVES:
        return HttpResponseForbidden('No API access on private archives for now')
    if request.META['REMOTE_ADDR'] not in settings.API_CLIENTS:
        return HttpResponseForbidden('Invalid host')
    if 'HTTP_X_APIKEY' not in request.META:
        return HttpResponseForbidden('No API key')
    if request.method != 'PUT':
        return HttpResponseForbidden('Invalid HTTP verb')

    apiclient = get_object_or_404(ApiClient, apikey=request.META['HTTP_X_APIKEY'])
    msg = get_object_or_404(Message, messageid=msgid)

    obj, created = ThreadSubscription.objects.get_or_create(apiclient=apiclient,
                                                            threadid=msg.threadid)
    return HttpResponse(status=201 if created else 200)
# We're intentionally putting the prefix text in the array here, since
# we might need that flexibility in the future.
# Index corresponds to Message.hiddenstatus; 0 means "not hidden".
hide_reasons = [
    None,  # placeholder for 0
    'This message has been hidden because a virus was found in the message.',  # 1
    'This message has been hidden because the message violated policies.',  # 2
    # Bug fix: was the ungrammatical "hidden because for privacy reasons".
    'This message has been hidden for privacy reasons.',  # 3
    'This message was corrupt',  # 4
]
class Message(models.Model):
    """A single archived email message (table 'messages')."""
    threadid = models.IntegerField(null=False, blank=False)
    mailfrom = models.TextField(null=False, db_column='_from')
    to = models.TextField(null=False, db_column='_to')
    cc = models.TextField(null=False)
    subject = models.TextField(null=False)
    date = models.DateTimeField(null=False)
    messageid = models.TextField(null=False)
    bodytxt = models.TextField(null=False)
    # rawtxt is a bytea field, which django doesn't support (easily)
    parentid = models.IntegerField(null=False, blank=False)
    has_attachment = models.BooleanField(null=False, default=False)
    # Index into hide_reasons; NULL/0 means not hidden.
    hiddenstatus = models.IntegerField(null=True)
    # fti is a tsvector field, which django doesn't support (easily)

    class Meta:
        db_table = 'messages'

    @property
    def printdate(self):
        # Human-readable timestamp for templates.
        return self.date.strftime("%Y-%m-%d %H:%M:%S")

    @property
    def shortdate(self):
        # Compact numeric timestamp.
        return self.date.strftime("%Y%m%d%H%M")

    # We explicitly cache the attachments here, so we can use them
    # multiple times from templates without generating multiple queries
    # to the database.
    _attachments = None

    @property
    def attachments(self):
        # Bug fix: compare against None instead of falsiness -- an empty
        # result previously failed the "not self._attachments" test and
        # re-ran the query on every access.
        if self._attachments is None:
            self._attachments = self.attachment_set.extra(select={'len': 'length(attachment)'}).all()
        return self._attachments

    @property
    def hiddenreason(self):
        """Human-readable reason the message is hidden, or None."""
        if not self.hiddenstatus:
            return None
        # Bug fix: replaces a bare except -- a negative hiddenstatus used
        # to index from the end of hide_reasons and report the wrong
        # reason instead of the generic fallback.
        if 0 < self.hiddenstatus < len(hide_reasons):
            return hide_reasons[self.hiddenstatus]
        # Weird value
        return 'This message has been hidden.'
class ListGroup(models.Model):
    """A grouping of mailing lists (table 'listgroups')."""
    groupid = models.IntegerField(null=False, primary_key=True)
    # Display name of the group.
    groupname = models.CharField(max_length=200, null=False, blank=False)
    # Manual ordering key -- presumably for presentation order; not used
    # in this chunk.
    sortkey = models.IntegerField(null=False)

    class Meta:
        db_table = 'listgroups'
class List(models.Model):
    """A mailing list, belonging to a ListGroup (table 'lists')."""
    listid = models.IntegerField(null=False, primary_key=True)
    listname = models.CharField(max_length=200, null=False, blank=False, unique=True)
    shortdesc = models.TextField(null=False, blank=False)
    description = models.TextField(null=False, blank=False)
    active = models.BooleanField(null=False, blank=False)
    group = models.ForeignKey(ListGroup, db_column='groupid')
    subscriber_access = models.BooleanField(null=False, blank=False, default=False, help_text="Subscribers can access contents (default is admins only)")

    @property
    def maybe_shortdesc(self):
        # Prefer the short description, falling back to the list name.
        return self.shortdesc or self.listname

    class Meta:
        db_table = 'lists'
class Attachment(models.Model):
    """A file attached to a Message; the binary payload lives in a bytea
    column not mapped by django (table 'attachments')."""
    message = models.ForeignKey(Message, null=False, blank=False, db_column='message')
    filename = models.CharField(max_length=1000, null=False, blank=False)
    contenttype = models.CharField(max_length=1000, null=False, blank=False)
    # attachment = bytea, not supported by django at this point

    class Meta:
        db_table = 'attachments'
        # Predictable same-as-insert order
        ordering = ('id',)

    def inlineable(self):
        # Return True if this image should be inlined: known image types
        # under 75kB only. Note! len needs to be set with extra(select=).
        inline_types = ('image/png', 'image/gif', 'image/jpg', 'image/jpeg')
        return self.contenttype in inline_types and self.len < 75000
class ListSubscriber(models.Model):
    """Subscriber of a list, used for access control on private archives."""
    # Only used when public access is not allowed.
    # We set the username of the community account instead of a
    # foreign key, because the user might not exist.
    list = models.ForeignKey(List, null=False, blank=False)
    # Community-auth username (no FK on purpose, see above).
    username = models.CharField(max_length=30, null=False, blank=False)

    class Meta:
        unique_together = (('list', 'username'), )
        db_table = 'listsubscribers'
class ApiClient(models.Model):
    """An external API client (table 'apiclients')."""
    # Shared secret the client presents in the X-APIKEY header
    # (checked in thread_subscribe).
    apikey = models.CharField(max_length=100, null=False, blank=False)
    # Client URL -- presumably where notifications are posted back;
    # not used in this chunk, verify against the notifier code.
    postback = models.URLField(max_length=500, null=False, blank=False)

    class Meta:
        db_table = 'apiclients'
class ThreadSubscription(models.Model):
    """An API client's subscription to a thread, created by thread_subscribe."""
    apiclient = models.ForeignKey(ApiClient, null=False, blank=False)
    threadid = models.IntegerField(null=False, blank=False)

    class Meta:
        db_table = 'threadsubscriptions'
        # At most one subscription per (client, thread) pair.
        unique_together = (('apiclient', 'threadid'),)
from django import shortcuts
class ERedirect(Exception):
    """Exception used to force a redirect from deep inside view helpers.

    Caught by RedirectMiddleware, which turns it into an HTTP redirect
    to *url*.
    """
    def __init__(self, url):
        self.url = url
class RedirectMiddleware(object):
    """Middleware that turns ERedirect exceptions into HTTP redirects."""
    def process_exception(self, request, exception):
        # Any other exception type is left to the next handler
        # (returning None means "not handled").
        if not isinstance(exception, ERedirect):
            return None
        return shortcuts.redirect(exception.url)
# Registry for this module's custom template filters (hidemail etc).
register = template.Library()
def _rewrite_email(value):
- return value.replace('@', '(at)').replace('.','(dot)')
+ return value.replace('@', '(at)').replace('.','(dot)')
@register.filter(name='hidemail')
@stringfilter
def hidemail(value):
    """Template filter: obfuscate the @ and dots of an email address."""
    return _rewrite_email(value)
# A regular expression and replacement function to mangle email addresses.
#
# are mangled.
# NOTE(review): the comment above looks truncated in this copy.
# Matches an email address, optionally preceded by a /m/ or /message-id/
# path prefix; the optional group(1) lets _rewrite_email_match skip
# addresses that are part of a message-id URL.
_re_mail = re.compile('(/m(essage-id)?/)?[^()<>@,;:\/\s"\'&|]+@[^()<>@,;:\/\s"\'&|]+')
def _rewrite_email_match(match):
    """Replacement callback for _re_mail.

    group(1) is the '/m/' or '/message-id/' prefix; when present the
    match is part of a message-id link and must be left untouched.
    """
    return match.group(0) if match.group(1) else _rewrite_email(match.group(0))
@register.filter(name='hideallemail')
@stringfilter
def hideallemail(value):
    """Template filter: obfuscate every email address in *value*, except
    those appearing as part of /message-id/ style links."""
    # The replacement function can be passed directly; the lambda wrapper
    # added nothing.
    return _re_mail.sub(_rewrite_email_match, value)
@register.filter(name='nameonly')
@stringfilter
def nameonly(value):
    """Template filter: reduce 'Some Name <user@host>' to just the name,
    falling back to the local part of the address."""
    name, email = parseaddr(value)
    return name if name else email.split('@')[0]
@register.filter(name='md5')
@stringfilter
def md5(value):
    """Template filter: hex md5 digest of the utf8-encoded value."""
    digest = hashlib.md5(value.encode('utf8'))
    return digest.hexdigest()
# Ensure the user is logged in (if it's not public lists)
def ensure_logged_in(request):
    """Raise ERedirect to the login page unless the archives are public
    or the request carries an authenticated user."""
    if settings.PUBLIC_ARCHIVES or (
            hasattr(request, 'user') and request.user.is_authenticated()):
        return
    raise ERedirect('%s?next=%s' % (settings.LOGIN_URL, request.path))
# Ensure the user has permissions to access a list. If not, raise
# a permissions exception.
def ensure_list_permissions(request, l):
    """Allow access when archives are public, the user is a superuser, or
    subscriber access is enabled and the user subscribes to *l*.

    Raises PermissionDenied for authenticated users without access, and
    ERedirect (to the login page) for anonymous requests.
    """
    if settings.PUBLIC_ARCHIVES:
        return
    if not (hasattr(request, 'user') and request.user.is_authenticated()):
        # Not logged in at all -- redirect to a login page.
        raise ERedirect('%s?next=%s' % (settings.LOGIN_URL, request.path))
    if request.user.is_superuser:
        return
    if l.subscriber_access and ListSubscriber.objects.filter(list=l, username=request.user.username).exists():
        return
    # Logged in but no access
    raise PermissionDenied("Access denied.")
# Ensure the user has permissions to access a message. In order to view
# a message, the user must have permissions on *all* lists the thread
# appears on.
def ensure_message_permissions(request, msgid):
- if settings.PUBLIC_ARCHIVES:
- return
- if hasattr(request, 'user') and request.user.is_authenticated():
- if request.user.is_superuser:
- return
-
- curs = connection.cursor()
- curs.execute("""SELECT EXISTS (
+ if settings.PUBLIC_ARCHIVES:
+ return
+ if hasattr(request, 'user') and request.user.is_authenticated():
+ if request.user.is_superuser:
+ return
+
+ curs = connection.cursor()
+ curs.execute("""SELECT EXISTS (
SELECT 1 FROM list_threads
INNER JOIN messages ON messages.threadid=list_threads.threadid
WHERE messages.messageid=%(msgid)s
AND NOT EXISTS (
    SELECT 1 FROM listsubscribers
    WHERE listsubscribers.list_id = list_threads.listid
    AND listsubscribers.username=%(username)s
)
)""", {
- 'msgid': msgid,
- 'username': request.user.username,
- })
- if not curs.fetchone()[0]:
- # This thread is not on any list that the user does not have permissions on.
- return
+ 'msgid': msgid,
+ 'username': request.user.username,
+ })
+ if not curs.fetchone()[0]:
+ # This thread is not on any list that the user does not have permissions on.
+ return
- # Logged in but no access
- raise PermissionDenied("Access denied.")
+ # Logged in but no access
+ raise PermissionDenied("Access denied.")
- # Redirect to a login page
- raise ERedirect('%s?next=%s' % (settings.LOGIN_URL, request.path))
+ # Redirect to a login page
+ raise ERedirect('%s?next=%s' % (settings.LOGIN_URL, request.path))
# Decorator to set cache age
def cache(days=0, hours=0, minutes=0, seconds=0):
- "Set the server to cache object a specified time. td must be a timedelta object"
- def _cache(fn):
- def __cache(request, *_args, **_kwargs):
- resp = fn(request, *_args, **_kwargs)
- if settings.PUBLIC_ARCHIVES:
- # Only set cache headers on public archives
- td = timedelta(hours=hours, minutes=minutes, seconds=seconds)
- resp['Cache-Control'] = 's-maxage=%s' % (td.days*3600*24 + td.seconds)
- return resp
- return __cache
- return _cache
+ "Set the server to cache object a specified time. td must be a timedelta object"
+ def _cache(fn):
+ def __cache(request, *_args, **_kwargs):
+ resp = fn(request, *_args, **_kwargs)
+ if settings.PUBLIC_ARCHIVES:
+ # Only set cache headers on public archives
+ td = timedelta(hours=hours, minutes=minutes, seconds=seconds)
+ resp['Cache-Control'] = 's-maxage=%s' % (td.days*3600*24 + td.seconds)
+ return resp
+ return __cache
+ return _cache
def nocache(fn):
- def _nocache(request, *_args, **_kwargs):
- resp = fn(request, *_args, **_kwargs)
- if settings.PUBLIC_ARCHIVES:
- # Only set cache headers on public archives
- resp['Cache-Control'] = 's-maxage=0'
- return resp
- return _nocache
+ def _nocache(request, *_args, **_kwargs):
+ resp = fn(request, *_args, **_kwargs)
+ if settings.PUBLIC_ARCHIVES:
+ # Only set cache headers on public archives
+ resp['Cache-Control'] = 's-maxage=0'
+ return resp
+ return _nocache
# Decorator to require http auth
def antispam_auth(fn):
- def _antispam_auth(request, *_args, **_kwargs):
- if not settings.PUBLIC_ARCHIVES:
- return fn(request, *_args, **_kwargs)
-
- if 'HTTP_AUTHORIZATION' in request.META:
- auth = request.META['HTTP_AUTHORIZATION'].split()
- if len(auth) != 2:
- return HttpResponseForbidden("Invalid authentication")
- if auth[0].lower() == "basic":
- user, pwd = base64.b64decode(auth[1]).decode('utf8', errors='ignore').split(':')
- if user == 'archives' and pwd == 'antispam':
- # Actually run the function if auth is correct
- resp = fn(request, *_args, **_kwargs)
- return resp
- # Require authentication
- response = HttpResponse()
- response.status_code = 401
- response['WWW-Authenticate'] = 'Basic realm="Please authenticate with user archives and password antispam"'
- return response
-
- return _antispam_auth
+ def _antispam_auth(request, *_args, **_kwargs):
+ if not settings.PUBLIC_ARCHIVES:
+ return fn(request, *_args, **_kwargs)
+
+ if 'HTTP_AUTHORIZATION' in request.META:
+ auth = request.META['HTTP_AUTHORIZATION'].split()
+ if len(auth) != 2:
+ return HttpResponseForbidden("Invalid authentication")
+ if auth[0].lower() == "basic":
+ user, pwd = base64.b64decode(auth[1]).decode('utf8', errors='ignore').split(':')
+ if user == 'archives' and pwd == 'antispam':
+ # Actually run the function if auth is correct
+ resp = fn(request, *_args, **_kwargs)
+ return resp
+ # Require authentication
+ response = HttpResponse()
+ response.status_code = 401
+ response['WWW-Authenticate'] = 'Basic realm="Please authenticate with user archives and password antispam"'
+ return response
+
+ return _antispam_auth
def get_all_groups_and_lists(request, listid=None):
- # Django doesn't (yet) support traversing the reverse relationship,
- # so we'll get all the lists and rebuild it backwards.
- if settings.PUBLIC_ARCHIVES or request.user.is_superuser:
- lists = List.objects.select_related('group').all().order_by('listname')
- else:
- lists = List.objects.select_related('group').filter(subscriber_access=True, listsubscriber__username=request.user.username).order_by('listname')
- listgroupid = None
- groups = {}
- for l in lists:
- if l.listid == listid:
- listgroupid = l.group.groupid
-
- if l.group.groupid in groups:
- groups[l.group.groupid]['lists'].append(l)
- else:
- groups[l.group.groupid] = {
- 'groupid': l.group.groupid,
- 'groupname': l.group.groupname,
- 'sortkey': l.group.sortkey,
- 'lists': [l,],
- 'homelink': 'list/group/%s' % l.group.groupid,
- }
-
- return (sorted(list(groups.values()), key=lambda g: g['sortkey']), listgroupid)
+ # Django doesn't (yet) support traversing the reverse relationship,
+ # so we'll get all the lists and rebuild it backwards.
+ if settings.PUBLIC_ARCHIVES or request.user.is_superuser:
+ lists = List.objects.select_related('group').all().order_by('listname')
+ else:
+ lists = List.objects.select_related('group').filter(subscriber_access=True, listsubscriber__username=request.user.username).order_by('listname')
+ listgroupid = None
+ groups = {}
+ for l in lists:
+ if l.listid == listid:
+ listgroupid = l.group.groupid
+
+ if l.group.groupid in groups:
+ groups[l.group.groupid]['lists'].append(l)
+ else:
+ groups[l.group.groupid] = {
+ 'groupid': l.group.groupid,
+ 'groupname': l.group.groupname,
+ 'sortkey': l.group.sortkey,
+ 'lists': [l,],
+ 'homelink': 'list/group/%s' % l.group.groupid,
+ }
+
+ return (sorted(list(groups.values()), key=lambda g: g['sortkey']), listgroupid)
class NavContext(object):
- def __init__(self, request, listid=None, listname=None, all_groups=None, expand_groupid=None):
- self.request = request
- self.ctx = {}
-
- if all_groups:
- groups = copy.deepcopy(all_groups)
- if expand_groupid:
- listgroupid = int(expand_groupid)
- else:
- (groups, listgroupid) = get_all_groups_and_lists(request, listid)
-
- for g in groups:
- # On the root page, remove *all* entries
- # On other lists, remove the entries in all groups other than our
- # own.
- if (not listid and not expand_groupid) or listgroupid != g['groupid']:
- # Root page, so remove *all* entries
- g['lists'] = []
-
- self.ctx.update({'listgroups': groups})
- if listname:
- self.ctx.update({'searchform_listname': listname})
+ def __init__(self, request, listid=None, listname=None, all_groups=None, expand_groupid=None):
+ self.request = request
+ self.ctx = {}
+
+ if all_groups:
+ groups = copy.deepcopy(all_groups)
+ if expand_groupid:
+ listgroupid = int(expand_groupid)
+ else:
+ (groups, listgroupid) = get_all_groups_and_lists(request, listid)
+
+ for g in groups:
+ # On the root page, remove *all* entries
+ # On other lists, remove the entries in all groups other than our
+ # own.
+ if (not listid and not expand_groupid) or listgroupid != g['groupid']:
+ # Root page, so remove *all* entries
+ g['lists'] = []
+
+ self.ctx.update({'listgroups': groups})
+ if listname:
+ self.ctx.update({'searchform_listname': listname})
def render_nav(navcontext, template, ctx):
- ctx.update(navcontext.ctx)
- return render(navcontext.request, template, ctx)
+ ctx.update(navcontext.ctx)
+ return render(navcontext.request, template, ctx)
@cache(hours=4)
def index(request):
- ensure_logged_in(request)
+ ensure_logged_in(request)
- (groups, listgroupid) = get_all_groups_and_lists(request)
- return render_nav(NavContext(request, all_groups=groups), 'index.html', {
- 'groups': [{'groupname': g['groupname'], 'lists': g['lists']} for g in groups],
- })
+ (groups, listgroupid) = get_all_groups_and_lists(request)
+ return render_nav(NavContext(request, all_groups=groups), 'index.html', {
+ 'groups': [{'groupname': g['groupname'], 'lists': g['lists']} for g in groups],
+ })
@cache(hours=8)
def groupindex(request, groupid):
- (groups, listgroupid) = get_all_groups_and_lists(request)
- mygroups = [{'groupname': g['groupname'], 'lists': g['lists']} for g in groups if g['groupid']==int(groupid)]
- if len(mygroups) == 0:
- raise Http404('List group does not exist')
+ (groups, listgroupid) = get_all_groups_and_lists(request)
+ mygroups = [{'groupname': g['groupname'], 'lists': g['lists']} for g in groups if g['groupid']==int(groupid)]
+ if len(mygroups) == 0:
+ raise Http404('List group does not exist')
- return render_nav(NavContext(request, all_groups=groups, expand_groupid=groupid), 'index.html', {
- 'groups': mygroups,
- })
+ return render_nav(NavContext(request, all_groups=groups, expand_groupid=groupid), 'index.html', {
+ 'groups': mygroups,
+ })
@cache(hours=8)
def monthlist(request, listname):
- l = get_object_or_404(List, listname=listname)
- ensure_list_permissions(request, l)
+ l = get_object_or_404(List, listname=listname)
+ ensure_list_permissions(request, l)
- curs = connection.cursor()
- curs.execute("SELECT year, month FROM list_months WHERE listid=%(listid)s ORDER BY year DESC, month DESC", {'listid': l.listid})
- months=[{'year':r[0],'month':r[1], 'date':datetime(r[0],r[1],1)} for r in curs.fetchall()]
+ curs = connection.cursor()
+ curs.execute("SELECT year, month FROM list_months WHERE listid=%(listid)s ORDER BY year DESC, month DESC", {'listid': l.listid})
+ months=[{'year':r[0],'month':r[1], 'date':datetime(r[0],r[1],1)} for r in curs.fetchall()]
- return render_nav(NavContext(request, l.listid, l.listname), 'monthlist.html', {
- 'list': l,
- 'months': months,
- })
+ return render_nav(NavContext(request, l.listid, l.listname), 'monthlist.html', {
+ 'list': l,
+ 'months': months,
+ })
def get_monthday_info(mlist, l, d):
- allmonths = set([m.date.month for m in mlist])
- monthdate = None
- daysinmonth = None
- if len(allmonths) == 1:
- # All hits are from one month, so generate month links
- monthdate = mlist[0].date
- elif len(allmonths) == 0:
- # No hits at all, so generate month links from the specified date
- monthdate = d
-
- if monthdate:
- curs = connection.cursor()
- curs.execute("SELECT DISTINCT extract(day FROM date) FROM messages WHERE date >= %(startdate)s AND date < %(enddate)s AND threadid IN (SELECT threadid FROM list_threads WHERE listid=%(listid)s) ORDER BY 1", {
- 'startdate': datetime(year=monthdate.year, month=monthdate.month, day=1),
- 'enddate': monthdate + timedelta(days=calendar.monthrange(monthdate.year, monthdate.month)[1]),
- 'listid': l.listid,
- })
- daysinmonth = [int(r[0]) for r in curs.fetchall()]
-
- yearmonth = None
- if monthdate:
- yearmonth = "%s%02d" % (monthdate.year, monthdate.month)
- return (yearmonth, daysinmonth)
+ allmonths = set([m.date.month for m in mlist])
+ monthdate = None
+ daysinmonth = None
+ if len(allmonths) == 1:
+ # All hits are from one month, so generate month links
+ monthdate = mlist[0].date
+ elif len(allmonths) == 0:
+ # No hits at all, so generate month links from the specified date
+ monthdate = d
+
+ if monthdate:
+ curs = connection.cursor()
+ curs.execute("SELECT DISTINCT extract(day FROM date) FROM messages WHERE date >= %(startdate)s AND date < %(enddate)s AND threadid IN (SELECT threadid FROM list_threads WHERE listid=%(listid)s) ORDER BY 1", {
+ 'startdate': datetime(year=monthdate.year, month=monthdate.month, day=1),
+ 'enddate': monthdate + timedelta(days=calendar.monthrange(monthdate.year, monthdate.month)[1]),
+ 'listid': l.listid,
+ })
+ daysinmonth = [int(r[0]) for r in curs.fetchall()]
+
+ yearmonth = None
+ if monthdate:
+ yearmonth = "%s%02d" % (monthdate.year, monthdate.month)
+ return (yearmonth, daysinmonth)
def _render_datelist(request, l, d, datefilter, title, queryproc):
- # NOTE! Basic permissions checks must be done before calling this function!
-
- if not settings.PUBLIC_ARCHIVES and not request.user.is_superuser:
- mlist = Message.objects.defer('bodytxt', 'cc', 'to').select_related().filter(datefilter, hiddenstatus__isnull=True).extra(
- where=["threadid IN (SELECT threadid FROM list_threads t WHERE listid=%s AND NOT EXISTS (SELECT 1 FROM list_threads t2 WHERE t2.threadid=t.threadid AND listid NOT IN (SELECT list_id FROM listsubscribers WHERE username=%s)))"],
- params=(l.listid, request.user.username),
- )
- else:
- # Else we return everything
- mlist = Message.objects.defer('bodytxt', 'cc', 'to').select_related().filter(datefilter, hiddenstatus__isnull=True).extra(where=["threadid IN (SELECT threadid FROM list_threads WHERE listid=%s)" % l.listid])
- mlist = queryproc(mlist)
-
- allyearmonths = set([(m.date.year, m.date.month) for m in mlist])
- (yearmonth, daysinmonth) = get_monthday_info(mlist, l, d)
-
- r = render_nav(NavContext(request, l.listid, l.listname), 'datelist.html', {
- 'list': l,
- 'messages': mlist,
- 'title': title,
- 'daysinmonth': daysinmonth,
- 'yearmonth': yearmonth,
- })
- r['X-pglm'] = ':%s:' % (':'.join(['%s/%s/%s' % (l.listid, year, month) for year,month in allyearmonths]))
- return r
+ # NOTE! Basic permissions checks must be done before calling this function!
+
+ if not settings.PUBLIC_ARCHIVES and not request.user.is_superuser:
+ mlist = Message.objects.defer('bodytxt', 'cc', 'to').select_related().filter(datefilter, hiddenstatus__isnull=True).extra(
+ where=["threadid IN (SELECT threadid FROM list_threads t WHERE listid=%s AND NOT EXISTS (SELECT 1 FROM list_threads t2 WHERE t2.threadid=t.threadid AND listid NOT IN (SELECT list_id FROM listsubscribers WHERE username=%s)))"],
+ params=(l.listid, request.user.username),
+ )
+ else:
+ # Else we return everything
+ mlist = Message.objects.defer('bodytxt', 'cc', 'to').select_related().filter(datefilter, hiddenstatus__isnull=True).extra(where=["threadid IN (SELECT threadid FROM list_threads WHERE listid=%s)" % l.listid])
+ mlist = queryproc(mlist)
+
+ allyearmonths = set([(m.date.year, m.date.month) for m in mlist])
+ (yearmonth, daysinmonth) = get_monthday_info(mlist, l, d)
+
+ r = render_nav(NavContext(request, l.listid, l.listname), 'datelist.html', {
+ 'list': l,
+ 'messages': mlist,
+ 'title': title,
+ 'daysinmonth': daysinmonth,
+ 'yearmonth': yearmonth,
+ })
+ r['X-pglm'] = ':%s:' % (':'.join(['%s/%s/%s' % (l.listid, year, month) for year,month in allyearmonths]))
+ return r
def render_datelist_from(request, l, d, title, to=None):
- # NOTE! Basic permissions checks must be done before calling this function!
- datefilter = Q(date__gte=d)
- if to:
- datefilter.add(Q(date__lt=to), Q.AND)
+ # NOTE! Basic permissions checks must be done before calling this function!
+ datefilter = Q(date__gte=d)
+ if to:
+ datefilter.add(Q(date__lt=to), Q.AND)
- return _render_datelist(request, l, d, datefilter, title,
- lambda x: list(x.order_by('date')[:200]))
+ return _render_datelist(request, l, d, datefilter, title,
+ lambda x: list(x.order_by('date')[:200]))
def render_datelist_to(request, l, d, title):
- # NOTE! Basic permissions checks must be done before calling this function!
+ # NOTE! Basic permissions checks must be done before calling this function!
- # Need to sort this backwards in the database to get the LIMIT applied
- # properly, and then manually resort it in the correct order. We can do
- # the second sort safely in python since it's not a lot of items..
+ # Need to sort this backwards in the database to get the LIMIT applied
+ # properly, and then manually resort it in the correct order. We can do
+ # the second sort safely in python since it's not a lot of items..
- return _render_datelist(request, l, d, Q(date__lte=d), title,
- lambda x: sorted(x.order_by('-date')[:200], key=lambda m: m.date))
+ return _render_datelist(request, l, d, Q(date__lte=d), title,
+ lambda x: sorted(x.order_by('-date')[:200], key=lambda m: m.date))
@cache(hours=2)
def datelistsince(request, listname, msgid):
- l = get_object_or_404(List, listname=listname)
- ensure_list_permissions(request, l)
+ l = get_object_or_404(List, listname=listname)
+ ensure_list_permissions(request, l)
- msg = get_object_or_404(Message, messageid=msgid)
- return render_datelist_from(request, l, msg.date, "%s since %s" % (l.listname, msg.date.strftime("%Y-%m-%d %H:%M:%S")))
+ msg = get_object_or_404(Message, messageid=msgid)
+ return render_datelist_from(request, l, msg.date, "%s since %s" % (l.listname, msg.date.strftime("%Y-%m-%d %H:%M:%S")))
# Longer cache since this will be used for the fixed date links
@cache(hours=4)
def datelistsincetime(request, listname, year, month, day, hour, minute):
- l = get_object_or_404(List, listname=listname)
- ensure_list_permissions(request, l)
+ l = get_object_or_404(List, listname=listname)
+ ensure_list_permissions(request, l)
- try:
- d = datetime(int(year), int(month), int(day), int(hour), int(minute))
- except ValueError:
- raise Http404("Invalid date format, not found")
- return render_datelist_from(request, l, d, "%s since %s" % (l.listname, d.strftime("%Y-%m-%d %H:%M")))
+ try:
+ d = datetime(int(year), int(month), int(day), int(hour), int(minute))
+ except ValueError:
+ raise Http404("Invalid date format, not found")
+ return render_datelist_from(request, l, d, "%s since %s" % (l.listname, d.strftime("%Y-%m-%d %H:%M")))
@cache(hours=2)
def datelistbefore(request, listname, msgid):
- l = get_object_or_404(List, listname=listname)
- ensure_list_permissions(request, l)
+ l = get_object_or_404(List, listname=listname)
+ ensure_list_permissions(request, l)
- msg = get_object_or_404(Message, messageid=msgid)
- return render_datelist_to(request, l, msg.date, "%s before %s" % (l.listname, msg.date.strftime("%Y-%m-%d %H:%M:%S")))
+ msg = get_object_or_404(Message, messageid=msgid)
+ return render_datelist_to(request, l, msg.date, "%s before %s" % (l.listname, msg.date.strftime("%Y-%m-%d %H:%M:%S")))
@cache(hours=2)
def datelistbeforetime(request, listname, year, month, day, hour, minute):
- l = get_object_or_404(List, listname=listname)
- ensure_list_permissions(request, l)
+ l = get_object_or_404(List, listname=listname)
+ ensure_list_permissions(request, l)
- try:
- d = datetime(int(year), int(month), int(day), int(hour), int(minute))
- except ValueError:
- raise Http404("Invalid date format, not found")
- return render_datelist_to(request, l, d, "%s before %s" % (l.listname, d.strftime("%Y-%m-%d %H:%M")))
+ try:
+ d = datetime(int(year), int(month), int(day), int(hour), int(minute))
+ except ValueError:
+ raise Http404("Invalid date format, not found")
+ return render_datelist_to(request, l, d, "%s before %s" % (l.listname, d.strftime("%Y-%m-%d %H:%M")))
@cache(hours=4)
def datelist(request, listname, year, month):
- l = get_object_or_404(List, listname=listname)
- ensure_list_permissions(request, l)
+ l = get_object_or_404(List, listname=listname)
+ ensure_list_permissions(request, l)
- try:
- d = datetime(int(year), int(month), 1)
- except ValueError:
- raise Http404("Malformatted date, month not found")
+ try:
+ d = datetime(int(year), int(month), 1)
+ except ValueError:
+ raise Http404("Malformatted date, month not found")
- enddate = d+timedelta(days=31)
- enddate = datetime(enddate.year, enddate.month, 1)
- return render_datelist_from(request, l, d, "%s - %s %s" % (l.listname, d.strftime("%B"), d.year), enddate)
+ enddate = d+timedelta(days=31)
+ enddate = datetime(enddate.year, enddate.month, 1)
+ return render_datelist_from(request, l, d, "%s - %s %s" % (l.listname, d.strftime("%B"), d.year), enddate)
@cache(hours=4)
def attachment(request, attid):
- # Use a direct query instead of django, since it has bad support for
- # bytea
- # XXX: minor information leak, because we load the whole attachment before we check
- # the thread permissions. Is that OK?
- curs = connection.cursor()
- curs.execute("SELECT filename, contenttype, messageid, attachment FROM attachments INNER JOIN messages ON messages.id=attachments.message AND attachments.id=%(id)s AND messages.hiddenstatus IS NULL", {'id': int(attid)})
- r = curs.fetchall()
- if len(r) != 1:
- return HttpResponse("Attachment not found")
+ # Use a direct query instead of django, since it has bad support for
+ # bytea
+ # XXX: minor information leak, because we load the whole attachment before we check
+ # the thread permissions. Is that OK?
+ curs = connection.cursor()
+ curs.execute("SELECT filename, contenttype, messageid, attachment FROM attachments INNER JOIN messages ON messages.id=attachments.message AND attachments.id=%(id)s AND messages.hiddenstatus IS NULL", {'id': int(attid)})
+ r = curs.fetchall()
+ if len(r) != 1:
+ return HttpResponse("Attachment not found")
- ensure_message_permissions(request, r[0][2])
+ ensure_message_permissions(request, r[0][2])
- return HttpResponse(r[0][3], content_type=r[0][1])
+ return HttpResponse(r[0][3], content_type=r[0][1])
def _build_thread_structure(threadid):
- # Yeah, this is *way* too complicated for the django ORM
- curs = connection.cursor()
- curs.execute("""WITH RECURSIVE t(id, _from, subject, date, messageid, has_attachment, parentid, datepath) AS(
+ # Yeah, this is *way* too complicated for the django ORM
+ curs = connection.cursor()
+ curs.execute("""WITH RECURSIVE t(id, _from, subject, date, messageid, has_attachment, parentid, datepath) AS(
SELECT id,_from,subject,date,messageid,has_attachment,parentid,array[]::timestamptz[] FROM messages m WHERE m.threadid=%(threadid)s AND parentid IS NULL
UNION ALL
SELECT m.id,m._from,m.subject,m.date,m.messageid,m.has_attachment,m.parentid,t.datepath||t.date FROM messages m INNER JOIN t ON t.id=m.parentid WHERE m.threadid=%(threadid)s
)
SELECT id,_from,subject,date,messageid,has_attachment,parentid,datepath FROM t ORDER BY datepath||date
""", {'threadid': threadid})
- for id,_from,subject,date,messageid,has_attachment,parentid,parentpath in curs.fetchall():
- yield {'id':id, 'mailfrom':_from, 'subject': subject, 'date': date, 'printdate': date.strftime("%Y-%m-%d %H:%M:%S"), 'messageid': messageid, 'hasattachment': has_attachment, 'parentid': parentid, 'indent': " " * len(parentpath)}
+ for id,_from,subject,date,messageid,has_attachment,parentid,parentpath in curs.fetchall():
+ yield {'id':id, 'mailfrom':_from, 'subject': subject, 'date': date, 'printdate': date.strftime("%Y-%m-%d %H:%M:%S"), 'messageid': messageid, 'hasattachment': has_attachment, 'parentid': parentid, 'indent': " " * len(parentpath)}
def _get_nextprevious(listmap, dt):
- curs = connection.cursor()
- curs.execute("""WITH l(listid) AS (
+ curs = connection.cursor()
+ curs.execute("""WITH l(listid) AS (
SELECT unnest(%(lists)s)
)
SELECT l.listid,1,
(SELECT ARRAY[messageid,to_char(date, 'yyyy-mm-dd hh24:mi:ss'),subject,_from] FROM messages m
- INNER JOIN list_threads lt ON lt.threadid=m.threadid
- WHERE m.date>%(time)s AND lt.listid=l.listid
- ORDER BY m.date LIMIT 1
+ INNER JOIN list_threads lt ON lt.threadid=m.threadid
+ WHERE m.date>%(time)s AND lt.listid=l.listid
+ ORDER BY m.date LIMIT 1
) FROM l
UNION ALL
SELECT l.listid,0,
(SELECT ARRAY[messageid,to_char(date, 'yyyy-mm-dd hh24:mi:ss'),subject,_from] FROM messages m
- INNER JOIN list_threads lt ON lt.threadid=m.threadid
- WHERE m.date<%(time)s AND lt.listid=l.listid
- ORDER BY m.date DESC LIMIT 1
+ INNER JOIN list_threads lt ON lt.threadid=m.threadid
+ WHERE m.date<%(time)s AND lt.listid=l.listid
+ ORDER BY m.date DESC LIMIT 1
) FROM l""", {
- 'lists': list(listmap.keys()),
- 'time': dt,
- })
- retval = {}
- for listid, isnext, data in curs.fetchall():
- if data:
- # Can be NULL, but if not, it will always have all fields
- listname = listmap[listid]
- d = {
- 'msgid': data[0],
- 'date': data[1],
- 'subject': data[2],
- 'from': data[3],
- }
- if listname in retval:
- retval[listname][isnext and 'next' or 'prev'] = d
- else:
- retval[listname] = {
- isnext and 'next' or 'prev': d
- }
- return retval
+ 'lists': list(listmap.keys()),
+ 'time': dt,
+ })
+ retval = {}
+ for listid, isnext, data in curs.fetchall():
+ if data:
+ # Can be NULL, but if not, it will always have all fields
+ listname = listmap[listid]
+ d = {
+ 'msgid': data[0],
+ 'date': data[1],
+ 'subject': data[2],
+ 'from': data[3],
+ }
+ if listname in retval:
+ retval[listname][isnext and 'next' or 'prev'] = d
+ else:
+ retval[listname] = {
+ isnext and 'next' or 'prev': d
+ }
+ return retval
@cache(hours=4)
def message(request, msgid):
- ensure_message_permissions(request, msgid)
-
- try:
- m = Message.objects.get(messageid=msgid)
- except Message.DoesNotExist:
- raise Http404('Message does not exist')
-
- lists = List.objects.extra(where=["listid IN (SELECT listid FROM list_threads WHERE threadid=%s)" % m.threadid]).order_by('listname')
- listmap = dict([(l.listid, l.listname) for l in lists])
- threadstruct = list(_build_thread_structure(m.threadid))
- newest = calendar.timegm(max(threadstruct, key=lambda x: x['date'])['date'].utctimetuple())
- if 'HTTP_IF_MODIFIED_SINCE' in request.META and not settings.DEBUG:
- ims = parse_http_date_safe(request.META.get("HTTP_IF_MODIFIED_SINCE"))
- if ims >= newest:
- return HttpResponseNotModified()
-
- responses = [t for t in threadstruct if t['parentid']==m.id]
-
- if m.parentid:
- for t in threadstruct:
- if t['id'] == m.parentid:
- parent = t
- break
- else:
- parent = None
- nextprev = _get_nextprevious(listmap, m.date)
-
- r = render_nav(NavContext(request, lists[0].listid, lists[0].listname), 'message.html', {
- 'msg': m,
- 'threadstruct': threadstruct,
- 'responses': responses,
- 'parent': parent,
- 'lists': lists,
- 'nextprev': nextprev,
- })
- r['X-pgthread'] = ":%s:" % m.threadid
- r['Last-Modified'] = http_date(newest)
- return r
+ ensure_message_permissions(request, msgid)
+
+ try:
+ m = Message.objects.get(messageid=msgid)
+ except Message.DoesNotExist:
+ raise Http404('Message does not exist')
+
+ lists = List.objects.extra(where=["listid IN (SELECT listid FROM list_threads WHERE threadid=%s)" % m.threadid]).order_by('listname')
+ listmap = dict([(l.listid, l.listname) for l in lists])
+ threadstruct = list(_build_thread_structure(m.threadid))
+ newest = calendar.timegm(max(threadstruct, key=lambda x: x['date'])['date'].utctimetuple())
+ if 'HTTP_IF_MODIFIED_SINCE' in request.META and not settings.DEBUG:
+ ims = parse_http_date_safe(request.META.get("HTTP_IF_MODIFIED_SINCE"))
+ if ims >= newest:
+ return HttpResponseNotModified()
+
+ responses = [t for t in threadstruct if t['parentid']==m.id]
+
+ if m.parentid:
+ for t in threadstruct:
+ if t['id'] == m.parentid:
+ parent = t
+ break
+ else:
+ parent = None
+ nextprev = _get_nextprevious(listmap, m.date)
+
+ r = render_nav(NavContext(request, lists[0].listid, lists[0].listname), 'message.html', {
+ 'msg': m,
+ 'threadstruct': threadstruct,
+ 'responses': responses,
+ 'parent': parent,
+ 'lists': lists,
+ 'nextprev': nextprev,
+ })
+ r['X-pgthread'] = ":%s:" % m.threadid
+ r['Last-Modified'] = http_date(newest)
+ return r
@cache(hours=4)
def message_flat(request, msgid):
- ensure_message_permissions(request, msgid)
-
- try:
- msg = Message.objects.get(messageid=msgid)
- except Message.DoesNotExist:
- raise Http404('Message does not exist')
- allmsg = list(Message.objects.filter(threadid=msg.threadid).order_by('date'))
- lists = List.objects.extra(where=["listid IN (SELECT listid FROM list_threads WHERE threadid=%s)" % msg.threadid]).order_by('listname')
-
- isfirst = (msg == allmsg[0])
-
- newest = calendar.timegm(max(allmsg, key=lambda x: x.date).date.utctimetuple())
- if 'HTTP_IF_MODIFIED_SINCE' in request.META and not settings.DEBUG:
- ims = parse_http_date_safe(request.META.get('HTTP_IF_MODIFIED_SINCE'))
- if ims >= newest:
- return HttpResponseNotModified()
-
- r = render_nav(NavContext(request), 'message_flat.html', {
- 'msg': msg,
- 'allmsg': allmsg,
- 'lists': lists,
- 'isfirst': isfirst,
- })
- r['X-pgthread'] = ":%s:" % msg.threadid
- r['Last-Modified'] = http_date(newest)
- return r
+ ensure_message_permissions(request, msgid)
+
+ try:
+ msg = Message.objects.get(messageid=msgid)
+ except Message.DoesNotExist:
+ raise Http404('Message does not exist')
+ allmsg = list(Message.objects.filter(threadid=msg.threadid).order_by('date'))
+ lists = List.objects.extra(where=["listid IN (SELECT listid FROM list_threads WHERE threadid=%s)" % msg.threadid]).order_by('listname')
+
+ isfirst = (msg == allmsg[0])
+
+ newest = calendar.timegm(max(allmsg, key=lambda x: x.date).date.utctimetuple())
+ if 'HTTP_IF_MODIFIED_SINCE' in request.META and not settings.DEBUG:
+ ims = parse_http_date_safe(request.META.get('HTTP_IF_MODIFIED_SINCE'))
+ if ims >= newest:
+ return HttpResponseNotModified()
+
+ r = render_nav(NavContext(request), 'message_flat.html', {
+ 'msg': msg,
+ 'allmsg': allmsg,
+ 'lists': lists,
+ 'isfirst': isfirst,
+ })
+ r['X-pgthread'] = ":%s:" % msg.threadid
+ r['Last-Modified'] = http_date(newest)
+ return r
@nocache
@antispam_auth
def message_raw(request, msgid):
- ensure_message_permissions(request, msgid)
+ ensure_message_permissions(request, msgid)
- curs = connection.cursor()
- curs.execute("SELECT threadid, hiddenstatus, rawtxt FROM messages WHERE messageid=%(messageid)s", {
- 'messageid': msgid,
- })
- row = curs.fetchall()
- if len(row) != 1:
- raise Http404('Message does not exist')
+ curs = connection.cursor()
+ curs.execute("SELECT threadid, hiddenstatus, rawtxt FROM messages WHERE messageid=%(messageid)s", {
+ 'messageid': msgid,
+ })
+ row = curs.fetchall()
+ if len(row) != 1:
+ raise Http404('Message does not exist')
- if row[0][1]:
- r = HttpResponse('This message has been hidden.', content_type='text/plain')
- else:
- r = HttpResponse(row[0][2], content_type='text/plain')
- r['X-pgthread'] = ":%s:" % row[0][0]
- return r
+ if row[0][1]:
+ r = HttpResponse('This message has been hidden.', content_type='text/plain')
+ else:
+ r = HttpResponse(row[0][2], content_type='text/plain')
+ r['X-pgthread'] = ":%s:" % row[0][0]
+ return r
def _build_mbox(query, params, msgid=None):
- connection.ensure_connection()
+ connection.ensure_connection()
- # Rawmsg is not in the django model, so we have to query it separately
- curs = connection.connection.cursor(name='mbox', withhold=True)
- curs.itersize = 50
- curs.execute(query, params)
+ # Rawmsg is not in the django model, so we have to query it separately
+ curs = connection.connection.cursor(name='mbox', withhold=True)
+ curs.itersize = 50
+ curs.execute(query, params)
- firstmsg = curs.fetchone()
- if msgid and firstmsg[0] != msgid:
- # Always redirect to the first message in the thread when building
- # the mbox, to not generate potentially multiple copies in
- # the cache.
- return HttpResponsePermanentRedirect(firstmsg[0])
+ firstmsg = curs.fetchone()
+ if msgid and firstmsg[0] != msgid:
+ # Always redirect to the first message in the thread when building
+ # the mbox, to not generate potentially multiple copies in
+ # the cache.
+ return HttpResponsePermanentRedirect(firstmsg[0])
- def _one_message(raw):
- # Parse as a message to generate headers
- s = BytesIO(raw)
- parser = email.parser.BytesParser(policy=email.policy.compat32)
- msg = parser.parse(s)
- return msg.as_string(unixfrom=True)
+ def _one_message(raw):
+ # Parse as a message to generate headers
+ s = BytesIO(raw)
+ parser = email.parser.BytesParser(policy=email.policy.compat32)
+ msg = parser.parse(s)
+ return msg.as_string(unixfrom=True)
- def _message_stream(first):
- yield _one_message(first[1])
+ def _message_stream(first):
+ yield _one_message(first[1])
- for mid, raw in curs:
- yield _one_message(raw)
+ for mid, raw in curs:
+ yield _one_message(raw)
- # Close must be done inside this function. If we close it in the
- # main function, it won't let the iterator run to completion.
- curs.close()
+ # Close must be done inside this function. If we close it in the
+ # main function, it won't let the iterator run to completion.
+ curs.close()
- r = StreamingHttpResponse(_message_stream(firstmsg))
- r['Content-type'] = 'application/mbox'
- return r
+ r = StreamingHttpResponse(_message_stream(firstmsg))
+ r['Content-type'] = 'application/mbox'
+ return r
@nocache
@antispam_auth
def message_mbox(request, msgid):
- ensure_message_permissions(request, msgid)
+ ensure_message_permissions(request, msgid)
- msg = get_object_or_404(Message, messageid=msgid)
+ msg = get_object_or_404(Message, messageid=msgid)
- return _build_mbox(
- "SELECT messageid, rawtxt FROM messages WHERE threadid=%(thread)s AND hiddenstatus IS NULL ORDER BY date",
- {
- 'thread': msg.threadid,
- },
- msgid)
+ return _build_mbox(
+ "SELECT messageid, rawtxt FROM messages WHERE threadid=%(thread)s AND hiddenstatus IS NULL ORDER BY date",
+ {
+ 'thread': msg.threadid,
+ },
+ msgid)
@nocache
@antispam_auth
def mbox(request, listname, listname2, mboxyear, mboxmonth):
- if (listname != listname2):
- raise Http404('List name mismatch')
- l = get_object_or_404(List, listname=listname)
- ensure_list_permissions(request, l)
-
- mboxyear = int(mboxyear)
- mboxmonth = int(mboxmonth)
-
- query = "SELECT messageid, rawtxt FROM messages m INNER JOIN list_threads t ON t.threadid=m.threadid WHERE listid=%(listid)s AND hiddenstatus IS NULL AND date >= %(startdate)s AND date <= %(enddate)s %%% ORDER BY date"
- params = {
- 'listid': l.listid,
- 'startdate': date(mboxyear, mboxmonth, 1),
- 'enddate': datetime(mboxyear, mboxmonth, calendar.monthrange(mboxyear, mboxmonth)[1], 23, 59, 59),
- }
-
- if not settings.PUBLIC_ARCHIVES and not request.user.is_superuser:
- # Restrict to only view messages that the user has permissions on all threads they're on
- query = query.replace('%%%', 'AND NOT EXISTS (SELECT 1 FROM list_threads t2 WHERE t2.threadid=t.threadid AND listid NOT IN (SELECT list_id FROM listsubscribers WHERE username=%(username)s))')
- params['username'] = request.user.username
- else:
- # Just return the whole thing
- query = query.replace('%%%', '')
- return _build_mbox(query, params)
+ if (listname != listname2):
+ raise Http404('List name mismatch')
+ l = get_object_or_404(List, listname=listname)
+ ensure_list_permissions(request, l)
+
+ mboxyear = int(mboxyear)
+ mboxmonth = int(mboxmonth)
+
+ query = "SELECT messageid, rawtxt FROM messages m INNER JOIN list_threads t ON t.threadid=m.threadid WHERE listid=%(listid)s AND hiddenstatus IS NULL AND date >= %(startdate)s AND date <= %(enddate)s %%% ORDER BY date"
+ params = {
+ 'listid': l.listid,
+ 'startdate': date(mboxyear, mboxmonth, 1),
+ 'enddate': datetime(mboxyear, mboxmonth, calendar.monthrange(mboxyear, mboxmonth)[1], 23, 59, 59),
+ }
+
+ if not settings.PUBLIC_ARCHIVES and not request.user.is_superuser:
+ # Restrict to only view messages that the user has permissions on all threads they're on
+ query = query.replace('%%%', 'AND NOT EXISTS (SELECT 1 FROM list_threads t2 WHERE t2.threadid=t.threadid AND listid NOT IN (SELECT list_id FROM listsubscribers WHERE username=%(username)s))')
+ params['username'] = request.user.username
+ else:
+ # Just return the whole thing
+ query = query.replace('%%%', '')
+ return _build_mbox(query, params)
def search(request):
- if not settings.PUBLIC_ARCHIVES:
- # We don't support searching of non-public archives at all at this point.
- # XXX: room for future improvement
- return HttpResponseForbidden('Not public archives')
-
- # Only certain hosts are allowed to call the search API
- if not request.META['REMOTE_ADDR'] in settings.SEARCH_CLIENTS:
- return HttpResponseForbidden('Invalid host')
-
- curs = connection.cursor()
-
- # Perform a search of the archives and return a JSON document.
- # Expects the following (optional) POST parameters:
- # q = query to search for
- # ln = comma separate list of listnames to search in
- # d = number of days back to search for, or -1 (or not specified)
- # to search the full archives
- # s = sort results by ['r'=rank, 'd'=date, 'i'=inverse date]
- if not request.method == 'POST':
- raise Http404('I only respond to POST')
-
- if 'q' not in request.POST:
- raise Http404('No search query specified')
- query = request.POST['q']
-
- if 'ln' in request.POST:
- try:
- curs.execute("SELECT listid FROM lists WHERE listname=ANY(%(names)s)", {
- 'names': request.POST['ln'].split(','),
- })
- lists = [x for x, in curs.fetchall()]
- except:
- # If failing to parse list of lists, just search all
- lists = None
- else:
- lists = None
-
- if 'd' in request.POST:
- days = int(request.POST['d'])
- if days < 1 or days > 365:
- firstdate = None
- else:
- firstdate = datetime.now() - timedelta(days=days)
- else:
- firstdate = None
-
- if 's' in request.POST:
- list_sort = request.POST['s']
- if not list_sort in ('d', 'r', 'i'):
- list_stort = 'r'
- else:
- list_sort = 'r'
-
- # Ok, we have all we need to do the search
-
- if query.find('@') > 0:
- # This could be a messageid. So try to get that one specifically first.
- # We don't do a more specific check if it's a messageid because doing
- # a key lookup is cheap...
- curs.execute("SELECT messageid FROM messages WHERE messageid=%(q)s", {
- 'q': query,
- })
- a = curs.fetchall()
- if len(a) == 1:
- # Yup, this was a messageid
- resp = HttpResponse(content_type='application/json')
-
- json.dump({'messageidmatch': 1}, resp)
- return resp
- # If not found, fall through to a regular search
-
- curs.execute("SET gin_fuzzy_search_limit=10000")
- qstr = "SELECT messageid, date, subject, _from, ts_rank_cd(fti, plainto_tsquery('public.pg', %(q)s)), ts_headline(bodytxt, plainto_tsquery('public.pg', %(q)s),'StartSel=\"[[[[[[\",StopSel=\"]]]]]]\"') FROM messages m WHERE fti @@ plainto_tsquery('public.pg', %(q)s)"
- params = {
- 'q': query,
- }
- if lists:
- qstr += " AND EXISTS (SELECT 1 FROM list_threads lt WHERE lt.threadid=m.threadid AND lt.listid=ANY(%(lists)s))"
- params['lists'] = lists
- if firstdate:
- qstr += " AND m.date > %(date)s"
- params['date'] = firstdate
- if list_sort == 'r':
- qstr += " ORDER BY ts_rank_cd(fti, plainto_tsquery(%(q)s)) DESC LIMIT 1000"
- elif list_sort == 'd':
- qstr += " ORDER BY date DESC LIMIT 1000"
- else:
- qstr += " ORDER BY date ASC LIMIT 1000"
-
- curs.execute(qstr, params)
-
- resp = HttpResponse(content_type='application/json')
-
- json.dump([{
- 'm': messageid,
- 'd': date.isoformat(),
- 's': subject,
- 'f': mailfrom,
- 'r': rank,
- 'a': abstract.replace("[[[[[[", "<b>").replace("]]]]]]","</b>"),
-
- } for messageid, date, subject, mailfrom, rank, abstract in curs.fetchall()],
- resp)
- return resp
+ if not settings.PUBLIC_ARCHIVES:
+ # We don't support searching of non-public archives at all at this point.
+ # XXX: room for future improvement
+ return HttpResponseForbidden('Not public archives')
+
+ # Only certain hosts are allowed to call the search API
+ if not request.META['REMOTE_ADDR'] in settings.SEARCH_CLIENTS:
+ return HttpResponseForbidden('Invalid host')
+
+ curs = connection.cursor()
+
+ # Perform a search of the archives and return a JSON document.
+ # Expects the following (optional) POST parameters:
+ # q = query to search for
+ # ln = comma separate list of listnames to search in
+ # d = number of days back to search for, or -1 (or not specified)
+ # to search the full archives
+ # s = sort results by ['r'=rank, 'd'=date, 'i'=inverse date]
+ if not request.method == 'POST':
+ raise Http404('I only respond to POST')
+
+ if 'q' not in request.POST:
+ raise Http404('No search query specified')
+ query = request.POST['q']
+
+ if 'ln' in request.POST:
+ try:
+ curs.execute("SELECT listid FROM lists WHERE listname=ANY(%(names)s)", {
+ 'names': request.POST['ln'].split(','),
+ })
+ lists = [x for x, in curs.fetchall()]
+ except:
+ # If failing to parse list of lists, just search all
+ lists = None
+ else:
+ lists = None
+
+ if 'd' in request.POST:
+ days = int(request.POST['d'])
+ if days < 1 or days > 365:
+ firstdate = None
+ else:
+ firstdate = datetime.now() - timedelta(days=days)
+ else:
+ firstdate = None
+
+ if 's' in request.POST:
+ list_sort = request.POST['s']
+ if not list_sort in ('d', 'r', 'i'):
+ list_sort = 'r'
+ else:
+ list_sort = 'r'
+
+ # Ok, we have all we need to do the search
+
+ if query.find('@') > 0:
+ # This could be a messageid. So try to get that one specifically first.
+ # We don't do a more specific check if it's a messageid because doing
+ # a key lookup is cheap...
+ curs.execute("SELECT messageid FROM messages WHERE messageid=%(q)s", {
+ 'q': query,
+ })
+ a = curs.fetchall()
+ if len(a) == 1:
+ # Yup, this was a messageid
+ resp = HttpResponse(content_type='application/json')
+
+ json.dump({'messageidmatch': 1}, resp)
+ return resp
+ # If not found, fall through to a regular search
+
+ curs.execute("SET gin_fuzzy_search_limit=10000")
+ qstr = "SELECT messageid, date, subject, _from, ts_rank_cd(fti, plainto_tsquery('public.pg', %(q)s)), ts_headline(bodytxt, plainto_tsquery('public.pg', %(q)s),'StartSel=\"[[[[[[\",StopSel=\"]]]]]]\"') FROM messages m WHERE fti @@ plainto_tsquery('public.pg', %(q)s)"
+ params = {
+ 'q': query,
+ }
+ if lists:
+ qstr += " AND EXISTS (SELECT 1 FROM list_threads lt WHERE lt.threadid=m.threadid AND lt.listid=ANY(%(lists)s))"
+ params['lists'] = lists
+ if firstdate:
+ qstr += " AND m.date > %(date)s"
+ params['date'] = firstdate
+ if list_sort == 'r':
+ qstr += " ORDER BY ts_rank_cd(fti, plainto_tsquery('public.pg', %(q)s)) DESC LIMIT 1000"
+ elif list_sort == 'd':
+ qstr += " ORDER BY date DESC LIMIT 1000"
+ else:
+ qstr += " ORDER BY date ASC LIMIT 1000"
+
+ curs.execute(qstr, params)
+
+ resp = HttpResponse(content_type='application/json')
+
+ json.dump([{
+ 'm': messageid,
+ 'd': date.isoformat(),
+ 's': subject,
+ 'f': mailfrom,
+ 'r': rank,
+ 'a': abstract.replace("[[[[[[", "<b>").replace("]]]]]]","</b>"),
+
+ } for messageid, date, subject, mailfrom, rank, abstract in curs.fetchall()],
+ resp)
+ return resp
@cache(seconds=10)
def web_sync_timestamp(request):
- s = datetime.now().strftime("%Y-%m-%d %H:%M:%S\n")
- r = HttpResponse(s, content_type='text/plain')
- r['Content-Length'] = len(s)
- return r
+ s = datetime.now().strftime("%Y-%m-%d %H:%M:%S\n")
+ r = HttpResponse(s, content_type='text/plain')
+ r['Content-Length'] = len(s)
+ return r
@cache(hours=8)
def legacy(request, listname, year, month, msgnum):
- curs = connection.cursor()
- curs.execute("SELECT msgid FROM legacymap WHERE listid=(SELECT listid FROM lists WHERE listname=%(list)s) AND year=%(year)s AND month=%(month)s AND msgnum=%(msgnum)s", {
- 'list': listname,
- 'year': year,
- 'month': month,
- 'msgnum': msgnum,
- })
- r = curs.fetchall()
- if len(r) != 1:
- raise Http404('Message does not exist')
- return HttpResponsePermanentRedirect('/message-id/%s' % r[0][0])
+ curs = connection.cursor()
+ curs.execute("SELECT msgid FROM legacymap WHERE listid=(SELECT listid FROM lists WHERE listname=%(list)s) AND year=%(year)s AND month=%(month)s AND msgnum=%(msgnum)s", {
+ 'list': listname,
+ 'year': year,
+ 'month': month,
+ 'msgnum': msgnum,
+ })
+ r = curs.fetchall()
+ if len(r) != 1:
+ raise Http404('Message does not exist')
+ return HttpResponsePermanentRedirect('/message-id/%s' % r[0][0])
# dynamic CSS serving, meaning we merge a number of different CSS into a
# single one, making sure it turns into a single http response. We do this
# dynamically, since the output will be cached.
_dynamic_cssmap = {
- 'base': ['media/css/main.css',
- 'media/css/normalize.css',],
- 'docs': ['media/css/global.css',
- 'media/css/table.css',
- 'media/css/text.css',
- 'media/css/docs.css'],
- }
+ 'base': ['media/css/main.css',
+ 'media/css/normalize.css',],
+ 'docs': ['media/css/global.css',
+ 'media/css/table.css',
+ 'media/css/text.css',
+ 'media/css/docs.css'],
+ }
@cache(hours=8)
def dynamic_css(request, css):
- if css not in _dynamic_cssmap:
- raise Http404('CSS not found')
- files = _dynamic_cssmap[css]
- resp = HttpResponse(content_type='text/css')
-
- # We honor if-modified-since headers by looking at the most recently
- # touched CSS file.
- latestmod = 0
- for fn in files:
- try:
- stime = os.stat(fn).st_mtime
- if latestmod < stime:
- latestmod = stime
- except OSError:
- # If we somehow referred to a file that didn't exist, or
- # one that we couldn't access.
- raise Http404('CSS (sub) not found')
- if 'HTTP_IF_MODIFIED_SINCE' in request.META:
- # This code is mostly stolen from django :)
- matches = re.match(r"^([^;]+)(; length=([0-9]+))?$",
- request.META.get('HTTP_IF_MODIFIED_SINCE'),
- re.IGNORECASE)
- header_mtime = parse_http_date_safe(matches.group(1))
- # We don't do length checking, just the date
- if int(latestmod) <= header_mtime:
- return HttpResponseNotModified(content_type='text/css')
- resp['Last-Modified'] = http_date(latestmod)
-
- for fn in files:
- with open(fn) as f:
- resp.write("/* %s */\n" % fn)
- resp.write(f.read())
- resp.write("\n")
-
- return resp
+ if css not in _dynamic_cssmap:
+ raise Http404('CSS not found')
+ files = _dynamic_cssmap[css]
+ resp = HttpResponse(content_type='text/css')
+
+ # We honor if-modified-since headers by looking at the most recently
+ # touched CSS file.
+ latestmod = 0
+ for fn in files:
+ try:
+ stime = os.stat(fn).st_mtime
+ if latestmod < stime:
+ latestmod = stime
+ except OSError:
+ # If we somehow referred to a file that didn't exist, or
+ # one that we couldn't access.
+ raise Http404('CSS (sub) not found')
+ if 'HTTP_IF_MODIFIED_SINCE' in request.META:
+ # This code is mostly stolen from django :)
+ matches = re.match(r"^([^;]+)(; length=([0-9]+))?$",
+ request.META.get('HTTP_IF_MODIFIED_SINCE'),
+ re.IGNORECASE)
+ header_mtime = parse_http_date_safe(matches.group(1)) if matches else None
+ # We don't do length checking, just the date
+ if header_mtime is not None and int(latestmod) <= header_mtime:
+ return HttpResponseNotModified(content_type='text/css')
+ resp['Last-Modified'] = http_date(latestmod)
+
+ for fn in files:
+ with open(fn) as f:
+ resp.write("/* %s */\n" % fn)
+ resp.write(f.read())
+ resp.write("\n")
+
+ return resp
# Redirect to the requested url, with a slash first. This is used to remove
# trailing slashes on messageid links by doing a permanent redirect. This is
# in the cache.
@cache(hours=8)
def slash_redirect(request, url):
- return HttpResponsePermanentRedirect("/%s" % url)
+ return HttpResponsePermanentRedirect("/%s" % url)
# Redirect the requested URL to whatever happens to be in the regexp capture.
# This is used for user agents that generate broken URLs that are easily
# captured using regexp.
@cache(hours=8)
def re_redirect(request, prefix, msgid):
- return HttpResponsePermanentRedirect("/%s%s" % (prefix, msgid))
+ return HttpResponsePermanentRedirect("/%s%s" % (prefix, msgid))
ROOT_URLCONF = 'archives.urls'
TEMPLATES = [{
- 'BACKEND': 'django.template.backends.django.DjangoTemplates',
- 'OPTIONS': {
- 'context_processors': [
- 'django.template.context_processors.request',
- 'django.contrib.messages.context_processors.messages',
- 'archives.util.PGWebContextProcessor',
- ],
- 'loaders': [
- 'django.template.loaders.filesystem.Loader',
- 'django.template.loaders.app_directories.Loader',
- ],
- },
+ 'BACKEND': 'django.template.backends.django.DjangoTemplates',
+ 'OPTIONS': {
+ 'context_processors': [
+ 'django.template.context_processors.request',
+ 'django.contrib.messages.context_processors.messages',
+ 'archives.util.PGWebContextProcessor',
+ ],
+ 'loaders': [
+ 'django.template.loaders.filesystem.Loader',
+ 'django.template.loaders.app_directories.Loader',
+ ],
+ },
}]
# 'django.contrib.admin',
# Uncomment the next line to enable admin documentation:
# 'django.contrib.admindocs',
- 'archives.mailarchives',
+ 'archives.mailarchives',
]
# A sample logging configuration. The only tangible logging
PUBLIC_ARCHIVES = False
try:
- from .settings_local import *
+ from .settings_local import *
except ImportError:
- pass
+ pass
# If this is a non-public site, enable middleware for handling logins etc
if not PUBLIC_ARCHIVES:
- MIDDLEWARE_CLASSES = [
- 'django.contrib.sessions.middleware.SessionMiddleware',
- 'django.contrib.auth.middleware.AuthenticationMiddleware',
- ] + MIDDLEWARE_CLASSES
- MIDDLEWARE_CLASSES.append('archives.mailarchives.redirecthandler.RedirectMiddleware')
-
- INSTALLED_APPS = [
- 'django.contrib.auth',
- 'django.contrib.contenttypes',
- 'django.contrib.sessions',
- ] + INSTALLED_APPS
-
- from archives.util import validate_new_user
- PGAUTH_CREATEUSER_CALLBACK=validate_new_user
+ MIDDLEWARE_CLASSES = [
+ 'django.contrib.sessions.middleware.SessionMiddleware',
+ 'django.contrib.auth.middleware.AuthenticationMiddleware',
+ ] + MIDDLEWARE_CLASSES
+ MIDDLEWARE_CLASSES.append('archives.mailarchives.redirecthandler.RedirectMiddleware')
+
+ INSTALLED_APPS = [
+ 'django.contrib.auth',
+ 'django.contrib.contenttypes',
+ 'django.contrib.sessions',
+ ] + INSTALLED_APPS
+
+ from archives.util import validate_new_user
+ PGAUTH_CREATEUSER_CALLBACK=validate_new_user
from django.utils.functional import SimpleLazyObject
def validate_new_user(username, email, firstname, lastname):
- # Only allow user creation if they are already a subscriber
- curs = connection.cursor()
- curs.execute("SELECT EXISTS(SELECT 1 FROM listsubscribers WHERE username=%(username)s)", {
- 'username': username,
- })
- if curs.fetchone()[0]:
- # User is subscribed to something, so allow creation
- return None
+ # Only allow user creation if they are already a subscriber
+ curs = connection.cursor()
+ curs.execute("SELECT EXISTS(SELECT 1 FROM listsubscribers WHERE username=%(username)s)", {
+ 'username': username,
+ })
+ if curs.fetchone()[0]:
+ # User is subscribed to something, so allow creation
+ return None
- return HttpResponse("You are not currently subscribed to any mailing list on this server. Account not created.")
+ return HttpResponse("You are not currently subscribed to any mailing list on this server. Account not created.")
def _get_gitrev():
- # Return the current git revision, that is used for
- # cache-busting URLs.
- try:
- with open('../.git/refs/heads/master') as f:
- return f.readline()[:8]
- except IOError:
- # A "git gc" will remove the ref and replace it with a packed-refs.
- try:
- with open('../.git/packed-refs') as f:
- for l in f.readlines():
- if l.endswith("refs/heads/master\n"):
- return l[:8]
- # Not found in packed-refs. Meh, just make one up.
- return 'ffffffff'
- except IOError:
- # If packed-refs also can't be read, just give up
- return 'eeeeeeee'
+ # Return the current git revision, that is used for
+ # cache-busting URLs.
+ try:
+ with open('../.git/refs/heads/master') as f:
+ return f.readline()[:8]
+ except IOError:
+ # A "git gc" will remove the ref and replace it with a packed-refs.
+ try:
+ with open('../.git/packed-refs') as f:
+ for l in f.readlines():
+ if l.endswith("refs/heads/master\n"):
+ return l[:8]
+ # Not found in packed-refs. Meh, just make one up.
+ return 'ffffffff'
+ except IOError:
+ # If packed-refs also can't be read, just give up
+ return 'eeeeeeee'
# Template context processor to add information about the root link and
# the current git revision. git revision is returned as a lazy object so
# we don't spend effort trying to load it if we don't need it (though
# all general pages will need it since it's used to render the css urls)
def PGWebContextProcessor(request):
- gitrev = SimpleLazyObject(_get_gitrev)
- return {
- 'gitrev': gitrev,
- }
+ gitrev = SimpleLazyObject(_get_gitrev)
+ return {
+ 'gitrev': gitrev,
+ }
import psycopg2
def scan_message(messageid, olddate, curs):
- u = "http://archives.postgresql.org/msgtxt.php?id=%s" % messageid
- print("Scanning message at %s (date reported as %s)..." % (u, olddate))
+ u = "http://archives.postgresql.org/msgtxt.php?id=%s" % messageid
+ print("Scanning message at %s (date reported as %s)..." % (u, olddate))
- f = urlopen(u)
- p = Parser()
- msg = p.parse(f)
- f.close()
+ f = urlopen(u)
+ p = Parser()
+ msg = p.parse(f)
+ f.close()
- # Can be either one of them, but we really don't care...
- ds = None
- for k,r in list(msg.items()):
- if k != 'Received': continue
+ # Can be either one of them, but we really don't care...
+ ds = None
+ for k,r in list(msg.items()):
+ if k != 'Received': continue
- print("Trying on %s" % r)
- m = re.search(';\s*(.*)$', r)
- if m:
- ds = m.group(1)
- break
- m = re.search(';\s*(.*)\s*\(envelope-from [^\)]+\)$', r)
- if m:
- ds = m.group(1)
- break
+ print("Trying on %s" % r)
+ m = re.search(';\s*(.*)$', r)
+ if m:
+ ds = m.group(1)
+ break
+ m = re.search(';\s*(.*)\s*\(envelope-from [^\)]+\)$', r)
+ if m:
+ ds = m.group(1)
+ break
- if not ds:
- print("Could not find date. Sorry.")
- return False
- d = None
- try:
- d = dateutil.parser.parse(ds)
- except:
- print("Could not parse date '%s', sorry." % ds)
- return
+ if not ds:
+ print("Could not find date. Sorry.")
+ return False
+ d = None
+ try:
+ d = dateutil.parser.parse(ds)
+ except:
+ print("Could not parse date '%s', sorry." % ds)
+ return
- while True:
- x = input("Parsed this as date %s. Update? " % d)
- if x.upper() == 'Y':
- curs.execute("UPDATE messages SET date=%(d)s WHERE messageid=%(m)s", {
- 'd': d,
- 'm': messageid,
- })
- print("Updated.")
- break
- elif x.upper() == 'N':
- break
-
+ while True:
+ x = input("Parsed this as date %s. Update? " % d)
+ if x.upper() == 'Y':
+ curs.execute("UPDATE messages SET date=%(d)s WHERE messageid=%(m)s", {
+ 'd': d,
+ 'm': messageid,
+ })
+ print("Updated.")
+ break
+ elif x.upper() == 'N':
+ break
+
if __name__ == "__main__":
- cfg = ConfigParser()
- cfg.read('%s/archives.ini' % os.path.realpath(os.path.dirname(sys.argv[0])))
- connstr = cfg.get('db','connstr')
+ cfg = ConfigParser()
+ cfg.read('%s/archives.ini' % os.path.realpath(os.path.dirname(sys.argv[0])))
+ connstr = cfg.get('db','connstr')
- conn = psycopg2.connect(connstr)
+ conn = psycopg2.connect(connstr)
- curs = conn.cursor()
- curs.execute("SELECT messageid, date FROM messages WHERE date>(CURRENT_TIMESTAMP+'1 day'::interval) OR date < '1997-01-01'")
- for messageid, date in curs.fetchall():
- scan_message(messageid, date, curs)
+ curs = conn.cursor()
+ curs.execute("SELECT messageid, date FROM messages WHERE date>(CURRENT_TIMESTAMP+'1 day'::interval) OR date < '1997-01-01'")
+ for messageid, date in curs.fetchall():
+ scan_message(messageid, date, curs)
- conn.commit()
- print("Done.")
+ conn.commit()
+ print("Done.")
def generate_single_mbox(conn, listid, year, month, destination):
- curs = conn.cursor()
- curs.execute("SELECT id, rawtxt FROM messages m INNER JOIN list_threads t ON t.threadid=m.threadid WHERE hiddenstatus IS NULL AND listid=%(listid)s AND date>=%(startdate)s AND date <= %(enddate)s ORDER BY date", {
- 'listid': listid,
- 'startdate': date(year, month, 1),
- 'enddate': date(year, month, calendar.monthrange(year, month)[1]),
- })
- with open(destination, 'w', encoding='utf8') as f:
- for id, raw, in curs:
- s = BytesIO(raw)
- parser = email.parser.BytesParser(policy=email.policy.compat32)
- msg = parser.parse(s)
- try:
- x = msg.as_string(unixfrom=True)
- f.write(x)
- except UnicodeEncodeError as e:
- print("Not including {0}, unicode error".format(msg['message-id']))
- except Exception as e:
- print("Not including {0}, exception {1}".format(msg['message-id'], e))
+ curs = conn.cursor()
+ curs.execute("SELECT id, rawtxt FROM messages m INNER JOIN list_threads t ON t.threadid=m.threadid WHERE hiddenstatus IS NULL AND listid=%(listid)s AND date>=%(startdate)s AND date <= %(enddate)s ORDER BY date", {
+ 'listid': listid,
+ 'startdate': date(year, month, 1),
+ 'enddate': date(year, month, calendar.monthrange(year, month)[1]),
+ })
+ with open(destination, 'w', encoding='utf8') as f:
+ for id, raw, in curs:
+ s = BytesIO(raw)
+ parser = email.parser.BytesParser(policy=email.policy.compat32)
+ msg = parser.parse(s)
+ try:
+ x = msg.as_string(unixfrom=True)
+ f.write(x)
+ except UnicodeEncodeError as e:
+ print("Not including {0}, unicode error".format(msg['message-id']))
+ except Exception as e:
+ print("Not including {0}, exception {1}".format(msg['message-id'], e))
if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="Generate mbox file(s)")
- parser.add_argument('--list', type=str, help='List to generate for')
- parser.add_argument('--month', type=str, help='year-month to generate for, e.g. 2017-02')
- parser.add_argument('--destination', type=str, help='File to write into (or directory for --auto)', required=True)
- parser.add_argument('--auto', action='store_true', help='Auto-generate latest month mboxes for all lists')
- parser.add_argument('--quiet', action='store_true', help='Run quiet')
+ parser = argparse.ArgumentParser(description="Generate mbox file(s)")
+ parser.add_argument('--list', type=str, help='List to generate for')
+ parser.add_argument('--month', type=str, help='year-month to generate for, e.g. 2017-02')
+ parser.add_argument('--destination', type=str, help='File to write into (or directory for --auto)', required=True)
+ parser.add_argument('--auto', action='store_true', help='Auto-generate latest month mboxes for all lists')
+ parser.add_argument('--quiet', action='store_true', help='Run quiet')
- args = parser.parse_args()
+ args = parser.parse_args()
- if args.auto:
- if (args.list or args.month):
- print("Must not specify list and month when auto-generating!")
- sys.exit(1)
- if not os.path.isdir(args.destination):
- print("Destination must be a directory, and exist, when auto-generating")
- sys.exit(1)
- else:
- if not (args.list and args.month and args.destination):
- print("Must specify list, month and destination when generating a single mailbox")
- parser.print_help()
- sys.exit(1)
+ if args.auto:
+ if (args.list or args.month):
+ print("Must not specify list and month when auto-generating!")
+ sys.exit(1)
+ if not os.path.isdir(args.destination):
+ print("Destination must be a directory, and exist, when auto-generating")
+ sys.exit(1)
+ else:
+ if not (args.list and args.month and args.destination):
+ print("Must specify list, month and destination when generating a single mailbox")
+ parser.print_help()
+ sys.exit(1)
- # Arguments OK, now connect
- cfg = ConfigParser()
- cfg.read(os.path.join(os.path.realpath(os.path.dirname(sys.argv[0])), 'archives.ini'))
- try:
- connstr = cfg.get('db','connstr')
- except:
- connstr = 'need_connstr'
+ # Arguments OK, now connect
+ cfg = ConfigParser()
+ cfg.read(os.path.join(os.path.realpath(os.path.dirname(sys.argv[0])), 'archives.ini'))
+ try:
+ connstr = cfg.get('db','connstr')
+ except:
+ connstr = 'need_connstr'
- conn = psycopg2.connect(connstr)
- curs = conn.cursor()
+ conn = psycopg2.connect(connstr)
+ curs = conn.cursor()
- if args.auto:
- curs.execute("SELECT listid, listname FROM lists WHERE active ORDER BY listname")
- all_lists = curs.fetchall()
- today = date.today()
- yesterday = today - timedelta(days=1)
- if today.month == yesterday.month:
- # Same month, so do it
- monthrange = ((today.year, today.month),)
- else:
- monthrange = ((today.year, today.month),(yesterday.year, yesterday.month))
- for lid, lname in all_lists:
- for year, month in monthrange:
- fullpath = os.path.join(args.destination, lname, 'files/public/archive')
- if not os.path.isdir(fullpath):
- os.makedirs(fullpath)
- if not args.quiet:
- print("Generating {0}-{1} for {2}".format(year, month, lname))
- generate_single_mbox(conn, lid, year, month,
- os.path.join(fullpath, "{0}.{0:04d}{1:02d}".format(year, month)))
- else:
- # Parse year and month
- m = re.match('^(\d{4})-(\d{2})$', args.month)
- if not m:
- print("Month must be specified on format YYYY-MM, not {0}".format(args.month))
- sys.exit(1)
- year = int(m.group(1))
- month = int(m.group(2))
+ if args.auto:
+ curs.execute("SELECT listid, listname FROM lists WHERE active ORDER BY listname")
+ all_lists = curs.fetchall()
+ today = date.today()
+ yesterday = today - timedelta(days=1)
+ if today.month == yesterday.month:
+ # Same month, so do it
+ monthrange = ((today.year, today.month),)
+ else:
+ monthrange = ((today.year, today.month),(yesterday.year, yesterday.month))
+ for lid, lname in all_lists:
+ for year, month in monthrange:
+ fullpath = os.path.join(args.destination, lname, 'files/public/archive')
+ if not os.path.isdir(fullpath):
+ os.makedirs(fullpath)
+ if not args.quiet:
+ print("Generating {0}-{1} for {2}".format(year, month, lname))
+ generate_single_mbox(conn, lid, year, month,
+ os.path.join(fullpath, "{0}.{1:04d}{2:02d}".format(lname, year, month)))
+ else:
+ # Parse year and month
+ m = re.match('^(\d{4})-(\d{2})$', args.month)
+ if not m:
+ print("Month must be specified on format YYYY-MM, not {0}".format(args.month))
+ sys.exit(1)
+ year = int(m.group(1))
+ month = int(m.group(2))
- curs.execute("SELECT listid FROM lists WHERE listname=%(name)s", {
- 'name': args.list,
- })
- if curs.rowcount != 1:
- print("List {0} not found.".format(args.list))
- sys.exit(1)
+ curs.execute("SELECT listid FROM lists WHERE listname=%(name)s", {
+ 'name': args.list,
+ })
+ if curs.rowcount != 1:
+ print("List {0} not found.".format(args.list))
+ sys.exit(1)
- if not args.quiet:
- print("Generating {0}-{1} for {2}".format(year, month, args.list))
- generate_single_mbox(conn, curs.fetchone()[0], year, month, args.destination)
+ if not args.quiet:
+ print("Generating {0}-{1} for {2}".format(year, month, args.list))
+ generate_single_mbox(conn, curs.fetchone()[0], year, month, args.destination)
from lib.varnish import VarnishPurger
reasons = [
- None, # Placeholder for 0
- "virus",
- "violates policies",
- "privacy",
- "corrupt",
+ None, # Placeholder for 0
+ "virus",
+ "violates policies",
+ "privacy",
+ "corrupt",
]
if __name__ == "__main__":
- optparser = OptionParser()
- optparser.add_option('-m', '--msgid', dest='msgid', help='Messageid to hide')
-
- (opt, args) = optparser.parse_args()
-
- if (len(args)):
- print("No bare arguments accepted")
- optparser.print_help()
- sys.exit(1)
-
- if not opt.msgid:
- print("Message-id must be specified")
- optparser.print_help()
- sys.exit(1)
-
- cfg = ConfigParser()
- cfg.read('%s/archives.ini' % os.path.realpath(os.path.dirname(sys.argv[0])))
- try:
- connstr = cfg.get('db','connstr')
- except:
- connstr = 'need_connstr'
-
- conn = psycopg2.connect(connstr)
- curs = conn.cursor()
-
- curs.execute("SELECT id, threadid, hiddenstatus FROM messages WHERE messageid=%(msgid)s", {
- 'msgid': opt.msgid,
- })
- if curs.rowcount <= 0:
- print("Message not found.")
- sys.exit(1)
-
- id, threadid, previous = curs.fetchone()
-
- # Message found, ask for reason
- reason = 0
- print("Current status: %s" % reasons[previous or 0])
- print("\n".join("%s - %s " % (n, reasons[n]) for n in range(len(reasons))))
- while True:
- reason = input('Reason for hiding message? ')
- try:
- reason = int(reason)
- except ValueError:
- continue
-
- if reason == 0:
- print("Un-hiding message")
- reason = None
- break
- else:
- try:
- print("Hiding message for reason: %s" % reasons[reason])
- except:
- continue
- break
- if previous == reason:
- print("No change in status, not updating")
- conn.close()
- sys.exit(0)
-
- curs.execute("UPDATE messages SET hiddenstatus=%(new)s WHERE id=%(id)s", {
- 'new': reason,
- 'id': id,
- })
- if curs.rowcount != 1:
- print("Failed to update! Not hiding!")
- conn.rollback()
- sys.exit(0)
- conn.commit()
-
- VarnishPurger(cfg).purge([int(threadid), ])
- conn.close()
-
- print("Message hidden and varnish purge triggered.")
+ optparser = OptionParser()
+ optparser.add_option('-m', '--msgid', dest='msgid', help='Messageid to hide')
+
+ (opt, args) = optparser.parse_args()
+
+ if (len(args)):
+ print("No bare arguments accepted")
+ optparser.print_help()
+ sys.exit(1)
+
+ if not opt.msgid:
+ print("Message-id must be specified")
+ optparser.print_help()
+ sys.exit(1)
+
+ cfg = ConfigParser()
+ cfg.read('%s/archives.ini' % os.path.realpath(os.path.dirname(sys.argv[0])))
+ try:
+ connstr = cfg.get('db','connstr')
+ except:
+ connstr = 'need_connstr'
+
+ conn = psycopg2.connect(connstr)
+ curs = conn.cursor()
+
+ curs.execute("SELECT id, threadid, hiddenstatus FROM messages WHERE messageid=%(msgid)s", {
+ 'msgid': opt.msgid,
+ })
+ if curs.rowcount <= 0:
+ print("Message not found.")
+ sys.exit(1)
+
+ id, threadid, previous = curs.fetchone()
+
+ # Message found, ask for reason
+ reason = 0
+ print("Current status: %s" % reasons[previous or 0])
+ print("\n".join("%s - %s " % (n, reasons[n]) for n in range(len(reasons))))
+ while True:
+ reason = input('Reason for hiding message? ')
+ try:
+ reason = int(reason)
+ except ValueError:
+ continue
+
+ if reason == 0:
+ print("Un-hiding message")
+ reason = None
+ break
+ else:
+ try:
+ print("Hiding message for reason: %s" % reasons[reason])
+ except:
+ continue
+ break
+ if previous == reason:
+ print("No change in status, not updating")
+ conn.close()
+ sys.exit(0)
+
+ curs.execute("UPDATE messages SET hiddenstatus=%(new)s WHERE id=%(id)s", {
+ 'new': reason,
+ 'id': id,
+ })
+ if curs.rowcount != 1:
+ print("Failed to update! Not hiding!")
+ conn.rollback()
+ sys.exit(0)
+ conn.commit()
+
+ VarnishPurger(cfg).purge([int(threadid), ])
+ conn.close()
+
+ print("Message hidden and varnish purge triggered.")
hp = HTMLParser()
def get_messageid(fn):
- with open(fn) as f:
- for l in f:
- if l.startswith('<!--X-Message-Id: '):
- # Found it!
- return hp.unescape(l[18:-5])
- raise Exception("No messageid in %s" % fn)
+ with open(fn) as f:
+ for l in f:
+ if l.startswith('<!--X-Message-Id: '):
+ # Found it!
+ return hp.unescape(l[18:-5])
+ raise Exception("No messageid in %s" % fn)
dirre = re.compile("^(\d+)-(\d+)$")
fnre = re.compile("^msg(\d+)\.php$")
for (dirpath, dirnames, filenames) in os.walk(root):
- # Dirpath is the full pathname
- base = os.path.basename(dirpath)
- m = dirre.match(base)
- if m:
- # Directory with actual files in it
- listname = os.path.basename(os.path.dirname(dirpath))
- for fn in filenames:
- m2 = fnre.match(fn)
- if m2:
- print "%s;%s;%s;%s;\"%s\"" % (listmap[listname], m.group(1), m.group(2), m2.group(1), get_messageid("%s/%s" % (dirpath, fn)))
+ # Dirpath is the full pathname
+ base = os.path.basename(dirpath)
+ m = dirre.match(base)
+ if m:
+ # Directory with actual files in it
+ listname = os.path.basename(os.path.dirname(dirpath))
+ for fn in filenames:
+ m2 = fnre.match(fn)
+ if m2:
+ print "%s;%s;%s;%s;\"%s\"" % (listmap[listname], m.group(1), m.group(2), m2.group(1), get_messageid("%s/%s" % (dirpath, fn)))
class IgnorableException(Exception):
- pass
+ pass
class Log(object):
- def __init__(self):
- self.verbose = False
+ def __init__(self):
+ self.verbose = False
- def set(self, verbose):
- self.verbose = verbose
+ def set(self, verbose):
+ self.verbose = verbose
- def status(self, msg):
- if self.verbose:
- print(msg)
+ def status(self, msg):
+ if self.verbose:
+ print(msg)
- def log(self, msg):
- print(msg)
+ def log(self, msg):
+ print(msg)
- def error(self, msg):
- print(msg)
+ def error(self, msg):
+ print(msg)
- def print_status(self):
- opstatus.print_status()
+ def print_status(self):
+ opstatus.print_status()
class OpStatus(object):
- def __init__(self):
- self.stored = 0
- self.dupes = 0
- self.tagged = 0
- self.failed = 0
- self.overwritten = 0
-
- def print_status(self):
- print("%s stored, %s new-list tagged, %s dupes, %s failed, %s overwritten" % (self.stored, self.tagged, self.dupes, self.failed, self.overwritten))
+ def __init__(self):
+ self.stored = 0
+ self.dupes = 0
+ self.tagged = 0
+ self.failed = 0
+ self.overwritten = 0
+
+ def print_status(self):
+ print("%s stored, %s new-list tagged, %s dupes, %s failed, %s overwritten" % (self.stored, self.tagged, self.dupes, self.failed, self.overwritten))
log = Log()
bSEPARATOR = bytes(SEPARATOR, 'ascii')
class MailboxBreakupParser(object):
- def __init__(self, fn):
- self.EOF = False
+ def __init__(self, fn):
+ self.EOF = False
- if fn.endswith(".gz"):
- cat = "zcat"
- else:
- cat = "cat"
- cmd = "%s %s | formail -s /bin/sh -c 'cat && echo %s'" % (cat, fn, SEPARATOR)
- self.pipe = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
+ if fn.endswith(".gz"):
+ cat = "zcat"
+ else:
+ cat = "cat"
+ cmd = "%s %s | formail -s /bin/sh -c 'cat && echo %s'" % (cat, fn, SEPARATOR)
+ self.pipe = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
- def returncode(self):
- self.pipe.wait()
- return self.pipe.returncode
+ def returncode(self):
+ self.pipe.wait()
+ return self.pipe.returncode
- def stderr_output(self):
- return self.pipe.stderr.read()
+ def stderr_output(self):
+ return self.pipe.stderr.read()
- def __next__(self):
- sio = BytesIO()
- while True:
- try:
- l = next(self.pipe.stdout)
- except StopIteration:
- # End of file!
- self.EOF = True
- if sio.tell() == 0:
- # Nothing read yet, so return None instead of an empty
- # bytesio
- return None
- sio.seek(0)
- return sio
- if l.rstrip() == bSEPARATOR:
- # Reached a separator. Meaning we're not at end of file,
- # but we're at end of message.
- sio.seek(0)
- return sio
- # Otherwise, append it to where we are now
- sio.write(l)
+ def __next__(self):
+ sio = BytesIO()
+ while True:
+ try:
+ l = next(self.pipe.stdout)
+ except StopIteration:
+ # End of file!
+ self.EOF = True
+ if sio.tell() == 0:
+ # Nothing read yet, so return None instead of an empty
+ # bytesio
+ return None
+ sio.seek(0)
+ return sio
+ if l.rstrip() == bSEPARATOR:
+ # Reached a separator. Meaning we're not at end of file,
+ # but we're at end of message.
+ sio.seek(0)
+ return sio
+ # Otherwise, append it to where we are now
+ sio.write(l)
from lib.log import log
class ArchivesParser(object):
- def __init__(self):
- self.parser = BytesParser(policy=compat32)
-
- def parse(self, stream):
- self.rawtxt = stream.read()
- self.msg = self.parser.parse(io.BytesIO(self.rawtxt))
-
- def is_msgid(self, msgid):
- # Look for a specific messageid. This means we might parse it twice,
- # but so be it. Any exception means we know it's not this one...
- try:
- if self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID'))) == msgid:
- return True
- except Exception as e:
- return False
-
- def analyze(self, date_override=None):
- self.msgid = self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID')))
- self._from = self.decode_mime_header(self.get_mandatory('From'), True)
- self.to = self.decode_mime_header(self.get_optional('To'), True)
- self.cc = self.decode_mime_header(self.get_optional('CC'), True)
- self.subject = self.decode_mime_header(self.get_optional('Subject'))
- if date_override:
- self.date = self.forgiving_date_decode(date_override)
- else:
- self.date = self.forgiving_date_decode(self.decode_mime_header(self.get_mandatory('Date')))
- self.bodytxt = self.get_body()
- self.attachments = []
- self.get_attachments()
- if len(self.attachments) > 0:
- log.status("Found %s attachments" % len(self.attachments))
-
- # Build an list of the message id's we are interested in
- self.parents = []
- # The first one is in-reply-to, if it exists
- if self.get_optional('in-reply-to'):
- m = self.clean_messageid(self.decode_mime_header(self.get_optional('in-reply-to')), True)
- if m:
- self.parents.append(m)
-
- # Then we add all References values, in backwards order
- if self.get_optional('references'):
- cleaned_msgids = [self.clean_messageid(x, True) for x in reversed(self.decode_mime_header(self.get_optional('references')).split())]
- # Can't do this with a simple self.parents.extend() due to broken
- # mailers that add the same reference more than once. And we can't
- # use a set() to make it unique, because order is very important
- for m in cleaned_msgids:
- if m and not m in self.parents:
- self.parents.append(m)
-
-
- def clean_charset(self, charset):
- lcharset = charset.lower()
- if lcharset == 'unknown-8bit' or lcharset == 'x-unknown' or lcharset == 'unknown':
- # Special case where we don't know... We'll assume
- # us-ascii and use replacements
- return 'us-ascii'
- if lcharset == '0' or lcharset == 'x-user-defined' or lcharset == '_autodetect_all' or lcharset == 'default_charset':
- # Seriously broken charset definitions, map to us-ascii
- # and throw away the rest with replacements
- return 'us-ascii'
- if lcharset == 'x-gbk':
- # Some MUAs set it to x-gbk, but there is a valid
- # declaratoin as gbk...
- return 'gbk'
- if lcharset == 'iso-8859-8-i':
- # -I is a special logical version, but should be the
- # same charset
- return 'iso-8859-8'
- if lcharset == 'windows-874':
- # This is an alias for iso-8859-11
- return 'iso-8859-11'
- if lcharset == 'iso-88-59-1' or lcharset == 'iso-8858-1':
- # Strange way of saying 8859....
- return 'iso-8859-1'
- if lcharset == 'iso885915':
- return 'iso-8859-15'
- if lcharset == 'iso-latin-2':
- return 'iso-8859-2'
- if lcharset == 'iso-850':
- # Strange spelling of cp850 (windows charset)
- return 'cp850'
- if lcharset == 'koi8r':
- return 'koi8-r'
- if lcharset == 'cp 1252':
- return 'cp1252'
- if lcharset == 'iso-8859-1,iso-8859-2' or lcharset == 'iso-8859-1:utf8:us-ascii':
- # Why did this show up more than once?!
- return 'iso-8859-1'
- if lcharset == 'x-windows-949':
- return 'ms949'
- if lcharset == 'pt_pt' or lcharset == 'de_latin' or lcharset == 'de':
- # This is a locale, and not a charset, but most likely it's this one
- return 'iso-8859-1'
- if lcharset == 'iso-8858-15':
- # How is this a *common* mistake?
- return 'iso-8859-15'
- if lcharset == 'macintosh':
- return 'mac_roman'
- if lcharset == 'cn-big5':
- return 'big5'
- if lcharset == 'x-unicode-2-0-utf-7':
- return 'utf-7'
- if lcharset == 'tscii':
- # No support for this charset :S Map it down to ascii
- # and throw away all the rest. sucks, but we have to
- return 'us-ascii'
- return charset
-
- def get_payload_as_unicode(self, msg):
- try:
- b = msg.get_payload(decode=True)
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enters a cannot-happen path.
- # In which case we just ignore it and hope for a better MIME part later.
- b = None
-
- if b:
- # Find out if there is a charset
- charset = None
- params = msg.get_params()
- if not params:
- # No content-type, so we assume us-ascii
- return str(b, 'us-ascii', errors='ignore')
- for k,v in params:
- if k.lower() == 'charset':
- charset = v
- break
- if charset:
- try:
- return str(b, self.clean_charset(charset), errors='ignore')
- except LookupError as e:
- raise IgnorableException("Failed to get unicode payload: %s" % e)
- else:
- # XXX: reasonable default?
- return str(b, errors='ignore')
- # Return None or empty string, depending on what we got back
- return b
-
- # Regular expression matching the PostgreSQL custom mail footer that
- # is appended to all emails.
- _re_footer = re.compile('(.*)--\s+\nSent via [^\s]+ mailing list \([^\)]+\)\nTo make changes to your subscription:\nhttp://www\.postgresql\.org/mailpref/[^\s]+\s*$', re.DOTALL)
- def get_body(self):
- b = self._get_body()
- if b:
- # Python bug 9133, allows unicode surrogate pairs - which PostgreSQL will
- # later reject..
- if b.find('\udbff\n\udef8'):
- b = b.replace('\udbff\n\udef8', '')
-
- # Remove postgres specific mail footer - if it's there
- m = self._re_footer.match(b)
- if m:
- b = m.group(1)
-
- # Sometimes we end up with a trailing \0 when decoding long strings, so
- # replace it if it's there.
- # In fact, replace it everywhere, since it can also turn up in the middle
- # of a text when it's a really broken decoding.
- b = b.replace('\0', '')
-
- return b
-
- def _get_body(self):
- # This is where the magic happens - try to figure out what the body
- # of this message should render as.
- hasempty = False
-
- # First see if this is a single-part message that we can just
- # decode and go.
- b = self.get_payload_as_unicode(self.msg)
- if b: return b
- if b == '':
- # We found something, but it was empty. We'll keep looking as
- # there might be something better available, but make a note
- # that empty exists.
- hasempty = True
-
- # Ok, it's multipart. Find the first part that is text/plain,
- # and use that one. Do this recursively, since we may have something
- # like:
- # multipart/mixed:
- # multipart/alternative:
- # text/plain
- # text/html
- # application/octet-stream (attachment)
- b = self.recursive_first_plaintext(self.msg)
- if b: return b
- if b == '':
- hasempty = True
-
- # Couldn't find a plaintext. Look for the first HTML in that case.
- # Fallback, but what can we do at this point...
- b = self.recursive_first_plaintext(self.msg, True)
- if b:
- b = self.html_clean(b)
- if b: return b
- if b == '' or b is None:
- hasempty = True
-
- if hasempty:
- log.status('Found empty body in %s' % self.msgid)
- return ''
- raise IgnorableException("Don't know how to read the body from %s" % self.msgid)
-
- def recursive_first_plaintext(self, container, html_instead=False):
- pl = container.get_payload()
- if isinstance(pl, str):
- # This was not a multipart, but it leaked... Give up!
- return None
- for p in pl:
- if p.get_params() == None:
- # MIME multipart/mixed, but no MIME type on the part
- log.status("Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain." % self.msgid)
- return self.get_payload_as_unicode(p)
- if p.get_params()[0][0].lower() == 'text/plain':
- # Don't include it if it looks like an attachment
- if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
- continue
- return self.get_payload_as_unicode(p)
- if html_instead and p.get_params()[0][0].lower() == 'text/html':
- # Don't include it if it looks like an attachment
- if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
- continue
- return self.get_payload_as_unicode(p)
- if p.is_multipart():
- b = self.recursive_first_plaintext(p, html_instead)
- if b or b == '': return b
-
- # Yikes, nothing here! Hopefully we'll find something when
- # we continue looping at a higher level.
- return None
-
- def get_attachments(self):
- self.attachments_found_first_plaintext = False
- self.recursive_get_attachments(self.msg)
-
- # Clean a filenames encoding and return it as a unicode string
- def _clean_filename_encoding(self, filename):
- # If this is a header-encoded filename, start by decoding that
- if filename.startswith('=?'):
- decoded, encoding = decode_header(filename)[0]
- return str(decoded, encoding, errors='ignore')
-
- # If it's already unicode, just return it
- if isinstance(filename, str):
- return filename
-
- # Anything that's not UTF8, we just get rid of. We can live with
- # filenames slightly mangled in this case.
- return str(filename, 'utf-8', errors='ignore')
-
- def _extract_filename(self, container):
- # Try to get the filename for an attachment in the container.
- # If the standard library can figure one out, use that one.
- f = container.get_filename()
- if f: return self._clean_filename_encoding(f)
-
- # Failing that, some mailers set Content-Description to the
- # filename
- if 'Content-Description' in container:
- return self._clean_filename_encoding(container['Content-Description'])
- return None
-
- def recursive_get_attachments(self, container):
- # We start recursion in the "multipart" container if any
- if container.get_content_type() == 'multipart/mixed' or container.get_content_type() == 'multipart/signed':
- # Multipart - worth scanning into
- if not container.is_multipart():
- # Wow, this is broken. It's multipart/mixed, but doesn't
- # contain multiple parts.
- # Since we're just looking for attachments, let's just
- # ignore it...
- return
- for p in container.get_payload():
- if p.get_params() == None:
- continue
- self.recursive_get_attachments(p)
- elif container.get_content_type() == 'multipart/alternative':
- # Alternative is not an attachment (we decide)
- # It's typilcally plantext + html
- self.attachments_found_first_plaintext = True
- return
- elif container.is_multipart():
- # Other kinds of multipart, such as multipart/signed...
- return
- else:
- # Not a multipart.
- # Exclude specific contenttypes
- if container.get_content_type() == 'application/pgp-signature':
- return
- if container.get_content_type() in ('application/pkcs7-signature', 'application/x-pkcs7-signature'):
- return
- # For now, accept anything not text/plain
- if container.get_content_type() != 'text/plain':
- try:
- self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enters a cannot-happen path.
- # In which case we just ignore this attachment.
- return
- return
-
- # It's a text/plain, it might be worthwhile.
- # If it has a name, we consider it an attachments
- if not container.get_params():
- return
- for k,v in container.get_params():
- if k=='name' and v != '':
- # Yes, it has a name
- try:
- self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enters a cannot-happen path.
- # In which case we just ignore this attachment.
- return
-
- return
-
- # If it's content-disposition=attachment, we also want to save it
- if 'Content-Disposition' in container and container['Content-Disposition'].startswith('attachment'):
- try:
- self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enters a cannot-happen path.
- # In which case we just ignore this attachment.
- return
-
- return
-
- # If we have already found one text/plain part, make all
- # further text/plain parts attachments
- if self.attachments_found_first_plaintext:
- # However, this will also *always* catch the MIME part added
- # by majordomo with the footer. So if that one is present,
- # we need to explicitly exclude it again.
- try:
- b = container.get_payload(decode=True)
- except AssertionError:
- # Badly encoded data can throw an exception here, where the python
- # libraries fail to handle it and enters a cannot-happen path.
- # In which case we just ignore this attachment.
- return
-
- if isinstance(b, str) and not self._re_footer.match(b):
- # We know there is no name for this one
- self.attachments.append((None, container.get_content_type(), b))
- return
-
- # Ok, so this was a plaintext that we ignored. Set the flag
- # that we have now ignored one, so we'll make the next one
- # an attachment.
- self.attachments_found_first_plaintext = True
- # No name, and text/plain, so ignore it
-
- re_msgid = re.compile('^\s*<(.*)>\s*')
- def clean_messageid(self, messageid, ignorebroken=False):
- m = self.re_msgid.match(messageid)
- if not m:
- if ignorebroken:
- log.status("Could not parse messageid '%s', ignoring it" % messageid)
- return None
- raise IgnorableException("Could not parse message id '%s'" % messageid)
- return m.groups(1)[0].replace(' ','')
-
-# _date_multi_re = re.compile(' \((\w+\s\w+(\s+\w+)*|)\)$')
- # Now using [^\s] instead of \w, to work with japanese chars
- _date_multi_re = re.compile(' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$')
- _date_multi_re2 = re.compile(' ([\+-]\d{4}) \([^)]+\)$')
- _date_multiminus_re = re.compile(' -(-\d+)$')
- _date_offsetnoplus_re = re.compile(' (\d{4})$')
- def forgiving_date_decode(self, d):
- if d.strip() == '':
- raise IgnorableException("Failed to parse empty date")
- # Strange timezones requiring manual adjustments
- if d.endswith('-7700 (EST)'):
- d = d.replace('-7700 (EST)', 'EST')
- if d.endswith('+6700 (EST)'):
- d = d.replace('+6700 (EST)', 'EST')
- if d.endswith('+-4-30'):
- d = d.replace('+-4-30', '+0430')
- if d.endswith('+1.00'):
- d = d.replace('+1.00', '+0100')
- if d.endswith('+-100'):
- d = d.replace('+-100', '+0100')
- if d.endswith('+500'):
- d = d.replace('+500', '+0500')
- if d.endswith('-500'):
- d = d.replace('-500', '-0500')
- if d.endswith('-700'):
- d = d.replace('-700', '-0700')
- if d.endswith('-800'):
- d = d.replace('-800', '-0800')
- if d.endswith('+05-30'):
- d = d.replace('+05-30', '+0530')
- if d.endswith('+0-900'):
- d = d.replace('+0-900', '-0900')
- if d.endswith('Mexico/General'):
- d = d.replace('Mexico/General','CDT')
- if d.endswith('Pacific Daylight Time'):
- d = d.replace('Pacific Daylight Time', 'PDT')
- if d.endswith(' ZE2'):
- d = d.replace(' ZE2',' +0200')
- if d.find('-Juin-') > 0:
- d = d.replace('-Juin-','-Jun-')
- if d.find('-Juil-') > 0:
- d = d.replace('-Juil-','-Jul-')
- if d.find(' 0 (GMT)'):
- d = d.replace(' 0 (GMT)',' +0000')
-
- if self._date_multiminus_re.search(d):
- d = self._date_multiminus_re.sub(' \\1', d)
-
- if self._date_offsetnoplus_re.search(d):
- d = self._date_offsetnoplus_re.sub('+\\1', d)
-
-
- # We have a number of dates in the format
- # "<full datespace> +0200 (MET DST)"
- # or similar. The problem coming from the space within the
- # parenthesis, or if the contents of the parenthesis is
- # completely empty
- if self._date_multi_re.search(d):
- d = self._date_multi_re.sub('', d)
-
- # If the spec is instead
- # "<full datespace> +0200 (...)"
- # of any kind, we can just remove what's in the (), because the
- # parser is just going to rely on the fixed offset anyway.
- if self._date_multi_re2.search(d):
- d = self._date_multi_re2.sub(' \\1', d)
-
- try:
- dp = dateutil.parser.parse(d, fuzzy=True)
-
- # Some offsets are >16 hours, which postgresql will not
- # (for good reasons) accept
- if dp.utcoffset() and abs(dp.utcoffset().days * (24*60*60) + dp.utcoffset().seconds) > 60*60*16-1:
- # Convert it to a UTC timestamp using Python. It will give
- # us the right time, but the wrong timezone. Should be
- # enough...
- dp = datetime.datetime(*dp.utctimetuple()[:6])
- return dp
- except Exception as e:
- raise IgnorableException("Failed to parse date '%s': %s" % (d, e))
-
- def _maybe_decode(self, s, charset):
- if isinstance(s, str):
- return s.strip(' ')
- return str(s, charset and self.clean_charset(charset) or 'us-ascii', errors='ignore').strip(' ')
-
- # Workaround for broken quoting in some MUAs (see below)
- _re_mailworkaround = re.compile('"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"', re.IGNORECASE)
- def _decode_mime_header(self, hdr, email_workaround):
- if hdr == None:
- return None
-
- # Per http://bugs.python.org/issue504152 (and lots of testing), it seems
- # we must get rid of the sequence \n\t at least in the header. If we
- # do this *before* doing any MIME decoding, we should be safe against
- # anybody *actually* putting that sequence in the header (since we
- # won't match the encoded contents)
- hdr = hdr.replace("\n\t"," ")
-
- # In at least some cases, at least gmail (and possibly other MUAs)
- # incorrectly put double quotes in the name/email field even when
- # it's encoded. That's not allowed - they have to be escaped - but
- # since there's a fair amount of those, we apply a regex to get
- # rid of them.
- m = self._re_mailworkaround.search(hdr)
- if m:
- hdr = self._re_mailworkaround.sub(r'\1', hdr)
-
- try:
- return " ".join([self._maybe_decode(s, charset) for s, charset in decode_header(hdr)])
- except HeaderParseError as e:
- # Parser error is typically someone specifying an encoding,
- # but then not actually using that encoding. We'll do the best
- # we can, which is cut it down to ascii and ignore errors
- return str(hdr, 'us-ascii', errors='ignore').strip(' ')
-
- def decode_mime_header(self, hdr, email_workaround=False):
- try:
- if isinstance(hdr, Header):
- hdr = hdr.encode()
-
- h = self._decode_mime_header(hdr, email_workaround)
- if h:
- return h.replace("\0", "")
- return ''
- except LookupError as e:
- raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, e))
- except ValueError as ve:
- raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, ve))
-
- def get_mandatory(self, fieldname):
- try:
- x = self.msg[fieldname]
- if x==None:
- raise Exception()
- return x
- except:
- raise IgnorableException("Mandatory field '%s' is missing" % fieldname)
-
- def get_optional(self, fieldname):
- try:
- return self.msg[fieldname]
- except:
- return ''
-
- def html_clean(self, html):
- # First we pass it through tidy
- (html, errors) = tidylib.tidy_document(html,
- options={
- 'drop-proprietary-attributes': 1,
- 'alt-text': '',
- 'hide-comments': 1,
- 'output-xhtml': 1,
- 'show-body-only': 1,
- 'clean': 1,
- 'char-encoding': 'utf8',
- 'show-warnings': 0,
- 'show-info': 0,
- })
- if errors:
- print(("HTML tidy failed for %s!" % self.msgid))
- print(errors)
- return None
-
- try:
- cleaner = HTMLCleaner()
- cleaner.feed(html)
- return cleaner.get_text()
- except Exception as e:
- # Failed to parse the html, thus failed to clean it. so we must
- # give up...
- return None
+ def __init__(self):
+ self.parser = BytesParser(policy=compat32)
+
+ def parse(self, stream):
+ self.rawtxt = stream.read()
+ self.msg = self.parser.parse(io.BytesIO(self.rawtxt))
+
+ def is_msgid(self, msgid):
+ # Look for a specific messageid. This means we might parse it twice,
+ # but so be it. Any exception means we know it's not this one...
+ try:
+ if self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID'))) == msgid:
+ return True
+ except Exception as e:
+ return False
+
+ def analyze(self, date_override=None):
+ self.msgid = self.clean_messageid(self.decode_mime_header(self.get_mandatory('Message-ID')))
+ self._from = self.decode_mime_header(self.get_mandatory('From'), True)
+ self.to = self.decode_mime_header(self.get_optional('To'), True)
+ self.cc = self.decode_mime_header(self.get_optional('CC'), True)
+ self.subject = self.decode_mime_header(self.get_optional('Subject'))
+ if date_override:
+ self.date = self.forgiving_date_decode(date_override)
+ else:
+ self.date = self.forgiving_date_decode(self.decode_mime_header(self.get_mandatory('Date')))
+ self.bodytxt = self.get_body()
+ self.attachments = []
+ self.get_attachments()
+ if len(self.attachments) > 0:
+ log.status("Found %s attachments" % len(self.attachments))
+
+        # Build a list of the message ids we are interested in
+ self.parents = []
+ # The first one is in-reply-to, if it exists
+ if self.get_optional('in-reply-to'):
+ m = self.clean_messageid(self.decode_mime_header(self.get_optional('in-reply-to')), True)
+ if m:
+ self.parents.append(m)
+
+ # Then we add all References values, in backwards order
+ if self.get_optional('references'):
+ cleaned_msgids = [self.clean_messageid(x, True) for x in reversed(self.decode_mime_header(self.get_optional('references')).split())]
+ # Can't do this with a simple self.parents.extend() due to broken
+ # mailers that add the same reference more than once. And we can't
+ # use a set() to make it unique, because order is very important
+ for m in cleaned_msgids:
+ if m and not m in self.parents:
+ self.parents.append(m)
+
+
+ def clean_charset(self, charset):
+ lcharset = charset.lower()
+ if lcharset == 'unknown-8bit' or lcharset == 'x-unknown' or lcharset == 'unknown':
+ # Special case where we don't know... We'll assume
+ # us-ascii and use replacements
+ return 'us-ascii'
+ if lcharset == '0' or lcharset == 'x-user-defined' or lcharset == '_autodetect_all' or lcharset == 'default_charset':
+ # Seriously broken charset definitions, map to us-ascii
+ # and throw away the rest with replacements
+ return 'us-ascii'
+ if lcharset == 'x-gbk':
+ # Some MUAs set it to x-gbk, but there is a valid
+            # declaration as gbk...
+ return 'gbk'
+ if lcharset == 'iso-8859-8-i':
+ # -I is a special logical version, but should be the
+ # same charset
+ return 'iso-8859-8'
+ if lcharset == 'windows-874':
+ # This is an alias for iso-8859-11
+ return 'iso-8859-11'
+ if lcharset == 'iso-88-59-1' or lcharset == 'iso-8858-1':
+ # Strange way of saying 8859....
+ return 'iso-8859-1'
+ if lcharset == 'iso885915':
+ return 'iso-8859-15'
+ if lcharset == 'iso-latin-2':
+ return 'iso-8859-2'
+ if lcharset == 'iso-850':
+ # Strange spelling of cp850 (windows charset)
+ return 'cp850'
+ if lcharset == 'koi8r':
+ return 'koi8-r'
+ if lcharset == 'cp 1252':
+ return 'cp1252'
+ if lcharset == 'iso-8859-1,iso-8859-2' or lcharset == 'iso-8859-1:utf8:us-ascii':
+ # Why did this show up more than once?!
+ return 'iso-8859-1'
+ if lcharset == 'x-windows-949':
+ return 'ms949'
+ if lcharset == 'pt_pt' or lcharset == 'de_latin' or lcharset == 'de':
+ # This is a locale, and not a charset, but most likely it's this one
+ return 'iso-8859-1'
+ if lcharset == 'iso-8858-15':
+ # How is this a *common* mistake?
+ return 'iso-8859-15'
+ if lcharset == 'macintosh':
+ return 'mac_roman'
+ if lcharset == 'cn-big5':
+ return 'big5'
+ if lcharset == 'x-unicode-2-0-utf-7':
+ return 'utf-7'
+ if lcharset == 'tscii':
+ # No support for this charset :S Map it down to ascii
+ # and throw away all the rest. sucks, but we have to
+ return 'us-ascii'
+ return charset
+
+ def get_payload_as_unicode(self, msg):
+ try:
+ b = msg.get_payload(decode=True)
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+ # libraries fail to handle it and enters a cannot-happen path.
+ # In which case we just ignore it and hope for a better MIME part later.
+ b = None
+
+ if b:
+ # Find out if there is a charset
+ charset = None
+ params = msg.get_params()
+ if not params:
+ # No content-type, so we assume us-ascii
+ return str(b, 'us-ascii', errors='ignore')
+ for k,v in params:
+ if k.lower() == 'charset':
+ charset = v
+ break
+ if charset:
+ try:
+ return str(b, self.clean_charset(charset), errors='ignore')
+ except LookupError as e:
+ raise IgnorableException("Failed to get unicode payload: %s" % e)
+ else:
+ # XXX: reasonable default?
+ return str(b, errors='ignore')
+ # Return None or empty string, depending on what we got back
+ return b
+
+ # Regular expression matching the PostgreSQL custom mail footer that
+ # is appended to all emails.
+ _re_footer = re.compile('(.*)--\s+\nSent via [^\s]+ mailing list \([^\)]+\)\nTo make changes to your subscription:\nhttp://www\.postgresql\.org/mailpref/[^\s]+\s*$', re.DOTALL)
+ def get_body(self):
+ b = self._get_body()
+ if b:
+ # Python bug 9133, allows unicode surrogate pairs - which PostgreSQL will
+ # later reject..
+ if b.find('\udbff\n\udef8'):
+ b = b.replace('\udbff\n\udef8', '')
+
+ # Remove postgres specific mail footer - if it's there
+ m = self._re_footer.match(b)
+ if m:
+ b = m.group(1)
+
+ # Sometimes we end up with a trailing \0 when decoding long strings, so
+ # replace it if it's there.
+ # In fact, replace it everywhere, since it can also turn up in the middle
+ # of a text when it's a really broken decoding.
+ b = b.replace('\0', '')
+
+ return b
+
+ def _get_body(self):
+ # This is where the magic happens - try to figure out what the body
+ # of this message should render as.
+ hasempty = False
+
+ # First see if this is a single-part message that we can just
+ # decode and go.
+ b = self.get_payload_as_unicode(self.msg)
+ if b: return b
+ if b == '':
+ # We found something, but it was empty. We'll keep looking as
+ # there might be something better available, but make a note
+ # that empty exists.
+ hasempty = True
+
+ # Ok, it's multipart. Find the first part that is text/plain,
+ # and use that one. Do this recursively, since we may have something
+ # like:
+ # multipart/mixed:
+ # multipart/alternative:
+ # text/plain
+ # text/html
+ # application/octet-stream (attachment)
+ b = self.recursive_first_plaintext(self.msg)
+ if b: return b
+ if b == '':
+ hasempty = True
+
+ # Couldn't find a plaintext. Look for the first HTML in that case.
+ # Fallback, but what can we do at this point...
+ b = self.recursive_first_plaintext(self.msg, True)
+ if b:
+ b = self.html_clean(b)
+ if b: return b
+ if b == '' or b is None:
+ hasempty = True
+
+ if hasempty:
+ log.status('Found empty body in %s' % self.msgid)
+ return ''
+ raise IgnorableException("Don't know how to read the body from %s" % self.msgid)
+
+ def recursive_first_plaintext(self, container, html_instead=False):
+ pl = container.get_payload()
+ if isinstance(pl, str):
+ # This was not a multipart, but it leaked... Give up!
+ return None
+ for p in pl:
+ if p.get_params() == None:
+ # MIME multipart/mixed, but no MIME type on the part
+ log.status("Found multipart/mixed in message '%s', but no MIME type on part. Trying text/plain." % self.msgid)
+ return self.get_payload_as_unicode(p)
+ if p.get_params()[0][0].lower() == 'text/plain':
+ # Don't include it if it looks like an attachment
+ if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
+ continue
+ return self.get_payload_as_unicode(p)
+ if html_instead and p.get_params()[0][0].lower() == 'text/html':
+ # Don't include it if it looks like an attachment
+ if 'Content-Disposition' in p and p['Content-Disposition'].startswith('attachment'):
+ continue
+ return self.get_payload_as_unicode(p)
+ if p.is_multipart():
+ b = self.recursive_first_plaintext(p, html_instead)
+ if b or b == '': return b
+
+ # Yikes, nothing here! Hopefully we'll find something when
+ # we continue looping at a higher level.
+ return None
+
+ def get_attachments(self):
+ self.attachments_found_first_plaintext = False
+ self.recursive_get_attachments(self.msg)
+
+ # Clean a filename's encoding and return it as a unicode string
+ def _clean_filename_encoding(self, filename):
+ # If this is a header-encoded filename, start by decoding that
+ if filename.startswith('=?'):
+ decoded, encoding = decode_header(filename)[0]
+ return str(decoded, encoding or 'us-ascii', errors='ignore')
+
+ # If it's already unicode, just return it
+ if isinstance(filename, str):
+ return filename
+
+ # Anything that's not UTF8, we just get rid of. We can live with
+ # filenames slightly mangled in this case.
+ return str(filename, 'utf-8', errors='ignore')
+
+ def _extract_filename(self, container):
+ # Try to get the filename for an attachment in the container.
+ # If the standard library can figure one out, use that one.
+ f = container.get_filename()
+ if f: return self._clean_filename_encoding(f)
+
+ # Failing that, some mailers set Content-Description to the
+ # filename
+ if 'Content-Description' in container:
+ return self._clean_filename_encoding(container['Content-Description'])
+ return None
+
+ def recursive_get_attachments(self, container):
+ # We start recursion in the "multipart" container if any
+ if container.get_content_type() == 'multipart/mixed' or container.get_content_type() == 'multipart/signed':
+ # Multipart - worth scanning into
+ if not container.is_multipart():
+ # Wow, this is broken. It's multipart/mixed, but doesn't
+ # contain multiple parts.
+ # Since we're just looking for attachments, let's just
+ # ignore it...
+ return
+ for p in container.get_payload():
+ if p.get_params() == None:
+ continue
+ self.recursive_get_attachments(p)
+ elif container.get_content_type() == 'multipart/alternative':
+ # Alternative is not an attachment (we decide)
+ # It's typically plaintext + html
+ self.attachments_found_first_plaintext = True
+ return
+ elif container.is_multipart():
+ # Other kinds of multipart, such as multipart/related...
+ return
+ else:
+ # Not a multipart.
+ # Exclude specific contenttypes
+ if container.get_content_type() == 'application/pgp-signature':
+ return
+ if container.get_content_type() in ('application/pkcs7-signature', 'application/x-pkcs7-signature'):
+ return
+ # For now, accept anything not text/plain
+ if container.get_content_type() != 'text/plain':
+ try:
+ self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+ # libraries fail to handle it and enters a cannot-happen path.
+ # In which case we just ignore this attachment.
+ return
+ return
+
+ # It's a text/plain, it might be worthwhile.
+ # If it has a name, we consider it an attachment
+ if not container.get_params():
+ return
+ for k,v in container.get_params():
+ if k=='name' and v != '':
+ # Yes, it has a name
+ try:
+ self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+ # libraries fail to handle it and enters a cannot-happen path.
+ # In which case we just ignore this attachment.
+ return
+
+ return
+
+ # If it's content-disposition=attachment, we also want to save it
+ if 'Content-Disposition' in container and container['Content-Disposition'].startswith('attachment'):
+ try:
+ self.attachments.append((self._extract_filename(container), container.get_content_type(), container.get_payload(decode=True)))
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+ # libraries fail to handle it and enters a cannot-happen path.
+ # In which case we just ignore this attachment.
+ return
+
+ return
+
+ # If we have already found one text/plain part, make all
+ # further text/plain parts attachments
+ if self.attachments_found_first_plaintext:
+ # However, this will also *always* catch the MIME part added
+ # by majordomo with the footer. So if that one is present,
+ # we need to explicitly exclude it again.
+ try:
+ b = container.get_payload(decode=True)
+ except AssertionError:
+ # Badly encoded data can throw an exception here, where the python
+ # libraries fail to handle it and enters a cannot-happen path.
+ # In which case we just ignore this attachment.
+ return
+
+ if isinstance(b, str) and not self._re_footer.match(b):
+ # We know there is no name for this one
+ self.attachments.append((None, container.get_content_type(), b))
+ return
+
+ # Ok, so this was a plaintext that we ignored. Set the flag
+ # that we have now ignored one, so we'll make the next one
+ # an attachment.
+ self.attachments_found_first_plaintext = True
+ # No name, and text/plain, so ignore it
+
+ re_msgid = re.compile(r'^\s*<(.*)>\s*')
+ def clean_messageid(self, messageid, ignorebroken=False):
+ m = self.re_msgid.match(messageid)
+ if not m:
+ if ignorebroken:
+ log.status("Could not parse messageid '%s', ignoring it" % messageid)
+ return None
+ raise IgnorableException("Could not parse message id '%s'" % messageid)
+ return m.groups(1)[0].replace(' ','')
+
+# _date_multi_re = re.compile(' \((\w+\s\w+(\s+\w+)*|)\)$')
+ # Now using [^\s] instead of \w, to work with japanese chars
+ _date_multi_re = re.compile(r' \(([^\s]+\s[^\s]+(\s+[^\s]+)*|)\)$')
+ _date_multi_re2 = re.compile(r' ([\+-]\d{4}) \([^)]+\)$')
+ _date_multiminus_re = re.compile(r' -(-\d+)$')
+ _date_offsetnoplus_re = re.compile(r' (\d{4})$')
+ def forgiving_date_decode(self, d):
+ if d.strip() == '':
+ raise IgnorableException("Failed to parse empty date")
+ # Strange timezones requiring manual adjustments
+ if d.endswith('-7700 (EST)'):
+ d = d.replace('-7700 (EST)', 'EST')
+ if d.endswith('+6700 (EST)'):
+ d = d.replace('+6700 (EST)', 'EST')
+ if d.endswith('+-4-30'):
+ d = d.replace('+-4-30', '+0430')
+ if d.endswith('+1.00'):
+ d = d.replace('+1.00', '+0100')
+ if d.endswith('+-100'):
+ d = d.replace('+-100', '+0100')
+ if d.endswith('+500'):
+ d = d.replace('+500', '+0500')
+ if d.endswith('-500'):
+ d = d.replace('-500', '-0500')
+ if d.endswith('-700'):
+ d = d.replace('-700', '-0700')
+ if d.endswith('-800'):
+ d = d.replace('-800', '-0800')
+ if d.endswith('+05-30'):
+ d = d.replace('+05-30', '+0530')
+ if d.endswith('+0-900'):
+ d = d.replace('+0-900', '-0900')
+ if d.endswith('Mexico/General'):
+ d = d.replace('Mexico/General','CDT')
+ if d.endswith('Pacific Daylight Time'):
+ d = d.replace('Pacific Daylight Time', 'PDT')
+ if d.endswith(' ZE2'):
+ d = d.replace(' ZE2',' +0200')
+ if d.find('-Juin-') > 0:
+ d = d.replace('-Juin-','-Jun-')
+ if d.find('-Juil-') > 0:
+ d = d.replace('-Juil-','-Jul-')
+ if d.find(' 0 (GMT)') > 0:
+ d = d.replace(' 0 (GMT)',' +0000')
+
+ if self._date_multiminus_re.search(d):
+ d = self._date_multiminus_re.sub(' \\1', d)
+
+ if self._date_offsetnoplus_re.search(d):
+ d = self._date_offsetnoplus_re.sub('+\\1', d)
+
+
+ # We have a number of dates in the format
+ # "<full datespace> +0200 (MET DST)"
+ # or similar. The problem coming from the space within the
+ # parenthesis, or if the contents of the parenthesis is
+ # completely empty
+ if self._date_multi_re.search(d):
+ d = self._date_multi_re.sub('', d)
+
+ # If the spec is instead
+ # "<full datespace> +0200 (...)"
+ # of any kind, we can just remove what's in the (), because the
+ # parser is just going to rely on the fixed offset anyway.
+ if self._date_multi_re2.search(d):
+ d = self._date_multi_re2.sub(' \\1', d)
+
+ try:
+ dp = dateutil.parser.parse(d, fuzzy=True)
+
+ # Some offsets are >16 hours, which postgresql will not
+ # (for good reasons) accept
+ if dp.utcoffset() and abs(dp.utcoffset().days * (24*60*60) + dp.utcoffset().seconds) > 60*60*16-1:
+ # Convert it to a UTC timestamp using Python. It will give
+ # us the right time, but the wrong timezone. Should be
+ # enough...
+ dp = datetime.datetime(*dp.utctimetuple()[:6])
+ return dp
+ except Exception as e:
+ raise IgnorableException("Failed to parse date '%s': %s" % (d, e))
+
+ def _maybe_decode(self, s, charset):
+ if isinstance(s, str):
+ return s.strip(' ')
+ return str(s, charset and self.clean_charset(charset) or 'us-ascii', errors='ignore').strip(' ')
+
+ # Workaround for broken quoting in some MUAs (see below)
+ _re_mailworkaround = re.compile(r'"(=\?[^\?]+\?[QB]\?[^\?]+\?=)"', re.IGNORECASE)
+ def _decode_mime_header(self, hdr, email_workaround):
+ if hdr == None:
+ return None
+
+ # Per http://bugs.python.org/issue504152 (and lots of testing), it seems
+ # we must get rid of the sequence \n\t at least in the header. If we
+ # do this *before* doing any MIME decoding, we should be safe against
+ # anybody *actually* putting that sequence in the header (since we
+ # won't match the encoded contents)
+ hdr = hdr.replace("\n\t"," ")
+
+ # In at least some cases, at least gmail (and possibly other MUAs)
+ # incorrectly put double quotes in the name/email field even when
+ # it's encoded. That's not allowed - they have to be escaped - but
+ # since there's a fair amount of those, we apply a regex to get
+ # rid of them.
+ m = self._re_mailworkaround.search(hdr)
+ if m:
+ hdr = self._re_mailworkaround.sub(r'\1', hdr)
+
+ try:
+ return " ".join([self._maybe_decode(s, charset) for s, charset in decode_header(hdr)])
+ except HeaderParseError as e:
+ # Parser error is typically someone specifying an encoding,
+ # but then not actually using that encoding. We'll do the best
+ # we can, which is cut it down to ascii and ignore errors
+ return hdr.encode('us-ascii', errors='ignore').decode('us-ascii').strip(' ')
+
+ def decode_mime_header(self, hdr, email_workaround=False):
+ try:
+ if isinstance(hdr, Header):
+ hdr = hdr.encode()
+
+ h = self._decode_mime_header(hdr, email_workaround)
+ if h:
+ return h.replace("\0", "")
+ return ''
+ except LookupError as e:
+ raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, e))
+ except ValueError as ve:
+ raise IgnorableException("Failed to decode header value '%s': %s" % (hdr, ve))
+
+ def get_mandatory(self, fieldname):
+ try:
+ x = self.msg[fieldname]
+ if x==None:
+ raise Exception()
+ return x
+ except:
+ raise IgnorableException("Mandatory field '%s' is missing" % fieldname)
+
+ def get_optional(self, fieldname):
+ try:
+ return self.msg[fieldname]
+ except:
+ return ''
+
+ def html_clean(self, html):
+ # First we pass it through tidy
+ (html, errors) = tidylib.tidy_document(html,
+ options={
+ 'drop-proprietary-attributes': 1,
+ 'alt-text': '',
+ 'hide-comments': 1,
+ 'output-xhtml': 1,
+ 'show-body-only': 1,
+ 'clean': 1,
+ 'char-encoding': 'utf8',
+ 'show-warnings': 0,
+ 'show-info': 0,
+ })
+ if errors:
+ print(("HTML tidy failed for %s!" % self.msgid))
+ print(errors)
+ return None
+
+ try:
+ cleaner = HTMLCleaner()
+ cleaner.feed(html)
+ return cleaner.get_text()
+ except Exception as e:
+ # Failed to parse the html, thus failed to clean it. so we must
+ # give up...
+ return None
class HTMLCleaner(HTMLParser):
- def __init__(self):
- HTMLParser.__init__(self)
- self.io = io.StringIO()
+ def __init__(self):
+ HTMLParser.__init__(self)
+ self.io = io.StringIO()
- def get_text(self):
- return self.io.getvalue()
+ def get_text(self):
+ return self.io.getvalue()
- def handle_data(self, data):
- self.io.write(data)
+ def handle_data(self, data):
+ self.io.write(data)
- def handle_starttag(self, tag, attrs):
- if tag == "p" or tag == "br":
- self.io.write("\n")
+ def handle_starttag(self, tag, attrs):
+ if tag == "p" or tag == "br":
+ self.io.write("\n")
from lib.log import log, opstatus
class ArchivesParserStorage(ArchivesParser):
- def __init__(self):
- super(ArchivesParserStorage, self).__init__()
- self.purges = set()
+ def __init__(self):
+ super(ArchivesParserStorage, self).__init__()
+ self.purges = set()
- def purge_list(self, listid, year, month):
- self.purges.add((int(listid), int(year), int(month)))
+ def purge_list(self, listid, year, month):
+ self.purges.add((int(listid), int(year), int(month)))
- def purge_thread(self, threadid):
- self.purges.add(int(threadid))
+ def purge_thread(self, threadid):
+ self.purges.add(int(threadid))
- def store(self, conn, listid, overwrite=False):
- curs = conn.cursor()
+ def store(self, conn, listid, overwrite=False):
+ curs = conn.cursor()
- # Potentially add the information that there exists a mail for
- # this month. We do that this early since we're always going to
- # make the check anyway, and this keeps the code in one place..
- if not overwrite:
- curs.execute("INSERT INTO list_months (listid, year, month) SELECT %(listid)s, %(year)s, %(month)s WHERE NOT EXISTS (SELECT listid FROM list_months WHERE listid=%(listid)s AND year=%(year)s AND month=%(month)s)", {
- 'listid': listid,
- 'year': self.date.year,
- 'month': self.date.month,
- })
+ # Potentially add the information that there exists a mail for
+ # this month. We do that this early since we're always going to
+ # make the check anyway, and this keeps the code in one place..
+ if not overwrite:
+ curs.execute("INSERT INTO list_months (listid, year, month) SELECT %(listid)s, %(year)s, %(month)s WHERE NOT EXISTS (SELECT listid FROM list_months WHERE listid=%(listid)s AND year=%(year)s AND month=%(month)s)", {
+ 'listid': listid,
+ 'year': self.date.year,
+ 'month': self.date.month,
+ })
- curs.execute("SELECT threadid, EXISTS(SELECT threadid FROM list_threads lt WHERE lt.listid=%(listid)s AND lt.threadid=m.threadid), id FROM messages m WHERE m.messageid=%(messageid)s", {
- 'messageid': self.msgid,
- 'listid': listid,
- })
- r = curs.fetchall()
- if len(r) > 0:
- # Has to be 1 row, since we have a unique index on id
- if not r[0][1] and not overwrite:
- log.status("Tagging message %s with list %s" % (self.msgid, listid))
- curs.execute("INSERT INTO list_threads (threadid, listid) VALUES (%(threadid)s, %(listid)s)", {
- 'threadid': r[0][0],
- 'listid': listid,
- })
- opstatus.tagged += 1
- self.purge_list(listid, self.date.year, self.date.month)
- self.purge_thread(r[0][0])
- else:
- opstatus.dupes += 1
+ curs.execute("SELECT threadid, EXISTS(SELECT threadid FROM list_threads lt WHERE lt.listid=%(listid)s AND lt.threadid=m.threadid), id FROM messages m WHERE m.messageid=%(messageid)s", {
+ 'messageid': self.msgid,
+ 'listid': listid,
+ })
+ r = curs.fetchall()
+ if len(r) > 0:
+ # Has to be 1 row, since we have a unique index on id
+ if not r[0][1] and not overwrite:
+ log.status("Tagging message %s with list %s" % (self.msgid, listid))
+ curs.execute("INSERT INTO list_threads (threadid, listid) VALUES (%(threadid)s, %(listid)s)", {
+ 'threadid': r[0][0],
+ 'listid': listid,
+ })
+ opstatus.tagged += 1
+ self.purge_list(listid, self.date.year, self.date.month)
+ self.purge_thread(r[0][0])
+ else:
+ opstatus.dupes += 1
- if overwrite:
- pk = r[0][2]
- self.purge_thread(r[0][0])
- # Overwrite an existing message. We do not attempt to
- # "re-thread" a message, we just update the contents. We
- # do remove all attachments and rewrite them. Of course, we
- # don't change the messageid (since it's our primary
- # identifyer), and we don't update the raw text of the message.
- # (since we are expected to have used that raw text to do
- # the re-parsing initially)
- # We update bodytext as a separate step so as not to rewrite
- # the TOAST table unnecessarily...
- curs.execute("UPDATE messages SET bodytxt=%(bodytxt)s WHERE id=%(id)s AND NOT (bodytxt=%(bodytxt)s) RETURNING id", {
- 'id': pk,
- 'bodytxt': self.bodytxt,
- })
- rc = curs.rowcount
- curs.execute("UPDATE messages SET _from=%(from)s, _to=%(to)s, cc=%(cc)s, subject=%(subject)s, date=%(date)s, has_attachment=%(has_attachment)s WHERE id=%(id)s AND NOT (_from=%(from)s AND _to=%(to)s AND cc=%(cc)s AND subject=%(subject)s AND date=%(date)s AND has_attachment=%(has_attachment)s) RETURNING id", {
- 'id': pk,
- 'from': self._from,
- 'to': self.to or '',
- 'cc': self.cc or '',
- 'subject': self.subject or '',
- 'date': self.date,
- 'has_attachment': len(self.attachments) > 0,
- })
- rc += curs.rowcount
- if rc == 0:
- log.status("Message %s unchanged" % self.msgid)
- return False
+ if overwrite:
+ pk = r[0][2]
+ self.purge_thread(r[0][0])
+ # Overwrite an existing message. We do not attempt to
+ # "re-thread" a message, we just update the contents. We
+ # do remove all attachments and rewrite them. Of course, we
+ # don't change the messageid (since it's our primary
+ # identifier), and we don't update the raw text of the message.
+ # (since we are expected to have used that raw text to do
+ # the re-parsing initially)
+ # We update bodytext as a separate step so as not to rewrite
+ # the TOAST table unnecessarily...
+ curs.execute("UPDATE messages SET bodytxt=%(bodytxt)s WHERE id=%(id)s AND NOT (bodytxt=%(bodytxt)s) RETURNING id", {
+ 'id': pk,
+ 'bodytxt': self.bodytxt,
+ })
+ rc = curs.rowcount
+ curs.execute("UPDATE messages SET _from=%(from)s, _to=%(to)s, cc=%(cc)s, subject=%(subject)s, date=%(date)s, has_attachment=%(has_attachment)s WHERE id=%(id)s AND NOT (_from=%(from)s AND _to=%(to)s AND cc=%(cc)s AND subject=%(subject)s AND date=%(date)s AND has_attachment=%(has_attachment)s) RETURNING id", {
+ 'id': pk,
+ 'from': self._from,
+ 'to': self.to or '',
+ 'cc': self.cc or '',
+ 'subject': self.subject or '',
+ 'date': self.date,
+ 'has_attachment': len(self.attachments) > 0,
+ })
+ rc += curs.rowcount
+ if rc == 0:
+ log.status("Message %s unchanged" % self.msgid)
+ return False
- curs.execute("DELETE FROM attachments WHERE message=%(message)s", {
- 'message': pk,
- })
- if len(self.attachments):
- curs.executemany("INSERT INTO attachments (message, filename, contenttype, attachment) VALUES (%(message)s, %(filename)s, %(contenttype)s, %(attachment)s)",[ {
- 'message': pk,
- 'filename': a[0] or 'unknown_filename',
- 'contenttype': a[1],
- 'attachment': bytearray(a[2]),
- } for a in self.attachments])
- opstatus.overwritten += 1
- log.status("Message %s overwritten" % self.msgid)
- else:
- log.status("Message %s already stored" % self.msgid)
- return True
+ curs.execute("DELETE FROM attachments WHERE message=%(message)s", {
+ 'message': pk,
+ })
+ if len(self.attachments):
+ curs.executemany("INSERT INTO attachments (message, filename, contenttype, attachment) VALUES (%(message)s, %(filename)s, %(contenttype)s, %(attachment)s)",[ {
+ 'message': pk,
+ 'filename': a[0] or 'unknown_filename',
+ 'contenttype': a[1],
+ 'attachment': bytearray(a[2]),
+ } for a in self.attachments])
+ opstatus.overwritten += 1
+ log.status("Message %s overwritten" % self.msgid)
+ else:
+ log.status("Message %s already stored" % self.msgid)
+ return True
- if overwrite:
- raise Exception("Attempt to overwrite message (%s) that doesn't exist on list %s!" % (self.msgid, listid))
- # Always purge the primary list for this thread
- self.purge_list(listid, self.date.year, self.date.month)
+ if overwrite:
+ raise Exception("Attempt to overwrite message (%s) that doesn't exist on list %s!" % (self.msgid, listid))
+ # Always purge the primary list for this thread
+ self.purge_list(listid, self.date.year, self.date.month)
- # Resolve own thread
- curs.execute("SELECT id, messageid, threadid FROM messages WHERE messageid=ANY(%(parents)s)", {
- 'parents': self.parents,
- })
- all_parents = curs.fetchall()
- if len(all_parents):
- # At least one of the parents exist. Now try to figure out which one
- best_parent = len(self.parents)+1
- best_threadid = -1
- best_parentid = None
- for i in range(0,len(all_parents)):
- for j in range(0,len(self.parents)):
- if self.parents[j] == all_parents[i][1]:
- # This messageid found. Better than the last one?
- if j < best_parent:
- best_parent = j
- best_parentid = all_parents[i][0]
- best_threadid = all_parents[i][2]
- if best_threadid == -1:
- raise Exception("Message %s, resolve failed in a way it shouldn't :P" % selg.msgid)
- self.parentid = best_parentid
- self.threadid = best_threadid
- # Slice away all matches that are worse than the one we wanted
- self.parents = self.parents[:best_parent]
+ # Resolve own thread
+ curs.execute("SELECT id, messageid, threadid FROM messages WHERE messageid=ANY(%(parents)s)", {
+ 'parents': self.parents,
+ })
+ all_parents = curs.fetchall()
+ if len(all_parents):
+ # At least one of the parents exist. Now try to figure out which one
+ best_parent = len(self.parents)+1
+ best_threadid = -1
+ best_parentid = None
+ for i in range(0,len(all_parents)):
+ for j in range(0,len(self.parents)):
+ if self.parents[j] == all_parents[i][1]:
+ # This messageid found. Better than the last one?
+ if j < best_parent:
+ best_parent = j
+ best_parentid = all_parents[i][0]
+ best_threadid = all_parents[i][2]
+ if best_threadid == -1:
+ raise Exception("Message %s, resolve failed in a way it shouldn't :P" % self.msgid)
+ self.parentid = best_parentid
+ self.threadid = best_threadid
+ # Slice away all matches that are worse than the one we wanted
+ self.parents = self.parents[:best_parent]
- log.status("Message %s resolved to existing thread %s, waiting for %s better messages" % (self.msgid, self.threadid, len(self.parents)))
- else:
- # No parent exist. But don't create the threadid just yet, since
- # it's possible that we're somebody elses parent!
- self.parentid = None
- self.threadid = None
+ log.status("Message %s resolved to existing thread %s, waiting for %s better messages" % (self.msgid, self.threadid, len(self.parents)))
+ else:
+ # No parent exist. But don't create the threadid just yet, since
+ # it's possible that we're somebody elses parent!
+ self.parentid = None
+ self.threadid = None
- # Now see if we are somebody elses *parent*...
- curs.execute("SELECT message, priority, threadid FROM unresolved_messages INNER JOIN messages ON messages.id=unresolved_messages.message WHERE unresolved_messages.msgid=%(msgid)s ORDER BY threadid", {
- 'msgid': self.msgid,
- })
- childrows = curs.fetchall()
- if len(childrows):
- # We are some already existing message's parent (meaning the
- # messages arrived out of order)
- # In the best case, the threadid is the same for all threads.
- # But it might be different if this it the "glue message" that's
- # holding other threads together.
- if self.threadid:
- # Already have a threadid, means that we have a glue message
- print("Message %s resolved to existing thread %s, while being somebodys parent" % (self.msgid, self.threadid))
- else:
- print("Message %s did not resolve to existing thread, but is somebodys parent" % self.msgid)
- # In this case, just pick the first thread from the list and merge into that
- # one.
- self.threadid = childrows[0][2]
+ # Now see if we are somebody elses *parent*...
+ curs.execute("SELECT message, priority, threadid FROM unresolved_messages INNER JOIN messages ON messages.id=unresolved_messages.message WHERE unresolved_messages.msgid=%(msgid)s ORDER BY threadid", {
+ 'msgid': self.msgid,
+ })
+ childrows = curs.fetchall()
+ if len(childrows):
+ # We are some already existing message's parent (meaning the
+ # messages arrived out of order)
+ # In the best case, the threadid is the same for all threads.
+ # But it might be different if this is the "glue message" that's
+ # holding other threads together.
+ if self.threadid:
+ # Already have a threadid, means that we have a glue message
+ print("Message %s resolved to existing thread %s, while being somebodys parent" % (self.msgid, self.threadid))
+ else:
+ print("Message %s did not resolve to existing thread, but is somebodys parent" % self.msgid)
+ # In this case, just pick the first thread from the list and merge into that
+ # one.
+ self.threadid = childrows[0][2]
- # Get a unique list (set) of all threads *except* the primary one,
- # because we'll be merging into that one.
- mergethreads = set([r[2] for r in childrows]).difference(set((self.threadid,)))
- if len(mergethreads):
- # We have one or more merge threads
- log.status("Merging threads %s into thread %s" % (",".join(str(s) for s in mergethreads), self.threadid))
- curs.execute("UPDATE messages SET threadid=%(threadid)s WHERE threadid=ANY(%(oldthreadids)s)", {
- 'threadid': self.threadid,
- 'oldthreadids': list(mergethreads),
- })
- # Insert any lists that were tagged on the merged threads
- curs.execute("INSERT INTO list_threads (threadid, listid) SELECT DISTINCT %(threadid)s,listid FROM list_threads lt2 WHERE lt2.threadid=ANY(%(oldthreadids)s) AND listid NOT IN (SELECT listid FROM list_threads lt3 WHERE lt3.threadid=%(threadid)s)", {
- 'threadid': self.threadid,
- 'oldthreadids': list(mergethreads),
- })
- # Remove all old leftovers
- curs.execute("DELETE FROM list_threads WHERE threadid=ANY(%(oldthreadids)s)", {
- 'oldthreadids': list(mergethreads),
- })
- # Purge varnish records for all the threads we just removed
- for t in mergethreads:
- self.purge_thread(t)
+ # Get a unique list (set) of all threads *except* the primary one,
+ # because we'll be merging into that one.
+ mergethreads = set([r[2] for r in childrows]).difference(set((self.threadid,)))
+ if len(mergethreads):
+ # We have one or more merge threads
+ log.status("Merging threads %s into thread %s" % (",".join(str(s) for s in mergethreads), self.threadid))
+ curs.execute("UPDATE messages SET threadid=%(threadid)s WHERE threadid=ANY(%(oldthreadids)s)", {
+ 'threadid': self.threadid,
+ 'oldthreadids': list(mergethreads),
+ })
+ # Insert any lists that were tagged on the merged threads
+ curs.execute("INSERT INTO list_threads (threadid, listid) SELECT DISTINCT %(threadid)s,listid FROM list_threads lt2 WHERE lt2.threadid=ANY(%(oldthreadids)s) AND listid NOT IN (SELECT listid FROM list_threads lt3 WHERE lt3.threadid=%(threadid)s)", {
+ 'threadid': self.threadid,
+ 'oldthreadids': list(mergethreads),
+ })
+ # Remove all old leftovers
+ curs.execute("DELETE FROM list_threads WHERE threadid=ANY(%(oldthreadids)s)", {
+ 'oldthreadids': list(mergethreads),
+ })
+ # Purge varnish records for all the threads we just removed
+ for t in mergethreads:
+ self.purge_thread(t)
- # Batch all the children for repointing. We can't do the actual
- # repointing until later, since we don't know our own id yet.
- self.children = [r[0] for r in childrows]
- log.status("Children set to %s with mergethreads being %s (from childrows %s and threadid %s)" % (
- self.children, mergethreads, childrows, self.threadid))
+ # Batch all the children for repointing. We can't do the actual
+ # repointing until later, since we don't know our own id yet.
+ self.children = [r[0] for r in childrows]
+ log.status("Children set to %s with mergethreads being %s (from childrows %s and threadid %s)" % (
+ self.children, mergethreads, childrows, self.threadid))
- # Finally, remove all the pending messages that had a higher
- # priority value (meaning less important) than us
- curs.executemany("DELETE FROM unresolved_messages WHERE message=%(msg)s AND priority >= %(prio)s", [{
- 'msg': msg,
- 'prio': prio,
- } for msg, prio, tid in childrows])
- else:
- self.children = []
+ # Finally, remove all the pending messages that had a higher
+ # priority value (meaning less important) than us
+ curs.executemany("DELETE FROM unresolved_messages WHERE message=%(msg)s AND priority >= %(prio)s", [{
+ 'msg': msg,
+ 'prio': prio,
+ } for msg, prio, tid in childrows])
+ else:
+ self.children = []
- if not self.threadid:
- # No parent and no child exists - create a new threadid, just for us!
- curs.execute("SELECT nextval('threadid_seq')")
- self.threadid = curs.fetchall()[0][0]
- log.status("Message %s resolved to no parent (out of %s) and no child, new thread %s" % (self.msgid, len(self.parents), self.threadid))
- else:
- # We have a threadid already, so we're not a new thread. Thus,
- # we need to purge the old thread
- self.purge_thread(self.threadid)
+ if not self.threadid:
+ # No parent and no child exists - create a new threadid, just for us!
+ curs.execute("SELECT nextval('threadid_seq')")
+ self.threadid = curs.fetchall()[0][0]
+ log.status("Message %s resolved to no parent (out of %s) and no child, new thread %s" % (self.msgid, len(self.parents), self.threadid))
+ else:
+ # We have a threadid already, so we're not a new thread. Thus,
+ # we need to purge the old thread
+ self.purge_thread(self.threadid)
- # Insert a thread tag if we're on a new list
- curs.execute("INSERT INTO list_threads (threadid, listid) SELECT %(threadid)s, %(listid)s WHERE NOT EXISTS (SELECT * FROM list_threads t2 WHERE t2.threadid=%(threadid)s AND t2.listid=%(listid)s) RETURNING threadid", {
- 'threadid': self.threadid,
- 'listid': listid,
- })
- if len(curs.fetchall()):
- log.status("Tagged thread %s with listid %s" % (self.threadid, listid))
+ # Insert a thread tag if we're on a new list
+ curs.execute("INSERT INTO list_threads (threadid, listid) SELECT %(threadid)s, %(listid)s WHERE NOT EXISTS (SELECT * FROM list_threads t2 WHERE t2.threadid=%(threadid)s AND t2.listid=%(listid)s) RETURNING threadid", {
+ 'threadid': self.threadid,
+ 'listid': listid,
+ })
+ if len(curs.fetchall()):
+ log.status("Tagged thread %s with listid %s" % (self.threadid, listid))
- curs.execute("INSERT INTO messages (parentid, threadid, _from, _to, cc, subject, date, has_attachment, messageid, bodytxt, rawtxt) VALUES (%(parentid)s, %(threadid)s, %(from)s, %(to)s, %(cc)s, %(subject)s, %(date)s, %(has_attachment)s, %(messageid)s, %(bodytxt)s, %(rawtxt)s) RETURNING id", {
- 'parentid': self.parentid,
- 'threadid': self.threadid,
- 'from': self._from,
- 'to': self.to or '',
- 'cc': self.cc or '',
- 'subject': self.subject or '',
- 'date': self.date,
- 'has_attachment': len(self.attachments) > 0,
- 'messageid': self.msgid,
- 'bodytxt': self.bodytxt,
- 'rawtxt': bytearray(self.rawtxt),
- })
- id = curs.fetchall()[0][0]
- log.status("Message %s, got id %s, set thread %s, parent %s" % (
- self.msgid, id, self.threadid, self.parentid))
- if len(self.attachments):
- # Insert attachments
- curs.executemany("INSERT INTO attachments (message, filename, contenttype, attachment) VALUES (%(message)s, %(filename)s, %(contenttype)s, %(attachment)s)",[ {
- 'message': id,
- 'filename': a[0] or 'unknown_filename',
- 'contenttype': a[1],
- 'attachment': bytearray(a[2]),
- } for a in self.attachments])
+ curs.execute("INSERT INTO messages (parentid, threadid, _from, _to, cc, subject, date, has_attachment, messageid, bodytxt, rawtxt) VALUES (%(parentid)s, %(threadid)s, %(from)s, %(to)s, %(cc)s, %(subject)s, %(date)s, %(has_attachment)s, %(messageid)s, %(bodytxt)s, %(rawtxt)s) RETURNING id", {
+ 'parentid': self.parentid,
+ 'threadid': self.threadid,
+ 'from': self._from,
+ 'to': self.to or '',
+ 'cc': self.cc or '',
+ 'subject': self.subject or '',
+ 'date': self.date,
+ 'has_attachment': len(self.attachments) > 0,
+ 'messageid': self.msgid,
+ 'bodytxt': self.bodytxt,
+ 'rawtxt': bytearray(self.rawtxt),
+ })
+ id = curs.fetchall()[0][0]
+ log.status("Message %s, got id %s, set thread %s, parent %s" % (
+ self.msgid, id, self.threadid, self.parentid))
+ if len(self.attachments):
+ # Insert attachments
+ curs.executemany("INSERT INTO attachments (message, filename, contenttype, attachment) VALUES (%(message)s, %(filename)s, %(contenttype)s, %(attachment)s)",[ {
+ 'message': id,
+ 'filename': a[0] or 'unknown_filename',
+ 'contenttype': a[1],
+ 'attachment': bytearray(a[2]),
+ } for a in self.attachments])
- if len(self.children):
- log.status("Setting %s other messages to children of %s" % (len(self.children), self.msgid))
- curs.executemany("UPDATE messages SET parentid=%(parent)s WHERE id=%(id)s",
- [{'parent': id, 'id': c} for c in self.children])
- if len(self.parents):
- # There are remaining parents we'd rather have to get ourselves
- # properly threaded - so store them in the db.
- curs.executemany("INSERT INTO unresolved_messages (message, priority, msgid) VALUES (%(id)s, %(priority)s, %(msgid)s)",
- [{'id': id, 'priority': i, 'msgid': self.parents[i]} for i in range(0, len(self.parents))])
+ if len(self.children):
+ log.status("Setting %s other messages to children of %s" % (len(self.children), self.msgid))
+ curs.executemany("UPDATE messages SET parentid=%(parent)s WHERE id=%(id)s",
+ [{'parent': id, 'id': c} for c in self.children])
+ if len(self.parents):
+ # There are remaining parents we'd rather have to get ourselves
+ # properly threaded - so store them in the db.
+ curs.executemany("INSERT INTO unresolved_messages (message, priority, msgid) VALUES (%(id)s, %(priority)s, %(msgid)s)",
+ [{'id': id, 'priority': i, 'msgid': self.parents[i]} for i in range(0, len(self.parents))])
- opstatus.stored += 1
- return True
+ opstatus.stored += 1
+ return True
- def diff(self, conn, f, fromonlyf, oldid):
- curs = conn.cursor()
+ def diff(self, conn, f, fromonlyf, oldid):
+ curs = conn.cursor()
- # Fetch the old one so we have something to diff against
- curs.execute("SELECT id, _from, _to, cc, subject, date, has_attachment, bodytxt FROM messages WHERE messageid=%(msgid)s", {
- 'msgid': self.msgid,
- })
- try:
- id, _from, to, cc, subject, date, has_attachment, bodytxt = curs.fetchone()
- except TypeError as e:
- f.write("---- %s ----\n" % self.msgid)
- f.write("Could not re-find in archives (old id was %s): %s\n" % (oldid, e))
- f.write("\n-------------------------------\n\n")
- return
+ # Fetch the old one so we have something to diff against
+ curs.execute("SELECT id, _from, _to, cc, subject, date, has_attachment, bodytxt FROM messages WHERE messageid=%(msgid)s", {
+ 'msgid': self.msgid,
+ })
+ try:
+ id, _from, to, cc, subject, date, has_attachment, bodytxt = curs.fetchone()
+ except TypeError as e:
+ f.write("---- %s ----\n" % self.msgid)
+ f.write("Could not re-find in archives (old id was %s): %s\n" % (oldid, e))
+ f.write("\n-------------------------------\n\n")
+ return
- if (_from.rstrip(), to.rstrip(), cc.rstrip(), subject.rstrip()) != (self._from, self.to, self.cc, self.subject):
- log.status("Message %s has header changes " % self.msgid)
- f.write("==== %s ====\n" % self.msgid)
- for fn in ['_from', 'to', 'cc', 'subject']:
- if getattr(self, fn) != eval(fn):
- s = "- {0}: {1}\n".format(fn, eval(fn))
- d = "+ {0}: {1}\n".format(fn, getattr(self, fn))
- f.write(s)
- f.write(d)
- f.write("\n\n")
+ if (_from.rstrip(), to.rstrip(), cc.rstrip(), subject.rstrip()) != (self._from, self.to, self.cc, self.subject):
+ log.status("Message %s has header changes " % self.msgid)
+ f.write("==== %s ====\n" % self.msgid)
+ for fn in ['_from', 'to', 'cc', 'subject']:
+ if getattr(self, fn) != eval(fn):
+ s = "- {0}: {1}\n".format(fn, eval(fn))
+ d = "+ {0}: {1}\n".format(fn, getattr(self, fn))
+ f.write(s)
+ f.write(d)
+ f.write("\n\n")
- if bodytxt != self.bodytxt:
- log.status("Message %s has body changes " % self.msgid)
- tempdiff = list(difflib.unified_diff(bodytxt.splitlines(),
- self.bodytxt.splitlines(),
- fromfile='old',
- tofile='new',
- n=0,
- lineterm=''))
- if (len(tempdiff)-2) % 3 == 0:
- # 3 rows to a diff, two header rows.
- # Then verify that each slice of 3 contains one @@ row (header), one -From and one +>From,
- # which indicates the only change is in the From.
- ok = True
- tempdiff = tempdiff[2:]
- while tempdiff:
- a,b,c = (tempdiff.pop(0), tempdiff.pop(0), tempdiff.pop(0))
- if not (a.startswith('@@ ') and b.startswith('-From ') and c.startswith('+>From ')):
- ok=False
- break
- if ok:
- fromonlyf.write("%s\n" % self.msgid)
- return
+ if bodytxt != self.bodytxt:
+ log.status("Message %s has body changes " % self.msgid)
+ tempdiff = list(difflib.unified_diff(bodytxt.splitlines(),
+ self.bodytxt.splitlines(),
+ fromfile='old',
+ tofile='new',
+ n=0,
+ lineterm=''))
+ if (len(tempdiff)-2) % 3 == 0:
+ # 3 rows to a diff, two header rows.
+ # Then verify that each slice of 3 contains one @@ row (header), one -From and one +>From,
+ # which indicates the only change is in the From.
+ ok = True
+ tempdiff = tempdiff[2:]
+ while tempdiff:
+ a,b,c = (tempdiff.pop(0), tempdiff.pop(0), tempdiff.pop(0))
+ if not (a.startswith('@@ ') and b.startswith('-From ') and c.startswith('+>From ')):
+ ok=False
+ break
+ if ok:
+ fromonlyf.write("%s\n" % self.msgid)
+ return
- # Generate a nicer diff
- d = list(difflib.unified_diff(bodytxt.splitlines(),
- self.bodytxt.splitlines(),
- fromfile='old',
- tofile='new',
- n=0,
- lineterm=''))
- if len(d) > 0:
- f.write("---- %s ----\n" % self.msgid)
- f.write("\n".join(d))
- f.write("\n\n")
- else:
- log.status("Message %s unchanged." % self.msgid)
+ # Generate a nicer diff
+ d = list(difflib.unified_diff(bodytxt.splitlines(),
+ self.bodytxt.splitlines(),
+ fromfile='old',
+ tofile='new',
+ n=0,
+ lineterm=''))
+ if len(d) > 0:
+ f.write("---- %s ----\n" % self.msgid)
+ f.write("\n".join(d))
+ f.write("\n\n")
+ else:
+ log.status("Message %s unchanged." % self.msgid)
from lib.log import log
class VarnishPurger(object):
- def __init__(self, cfg):
- self.cfg = cfg
+ def __init__(self, cfg):
+ self.cfg = cfg
- def purge(self, purges):
- if not len(purges):
- return
+ def purge(self, purges):
+ if not len(purges):
+ return
- if not self.cfg.has_option('varnish', 'purgeurl'):
- return
+ if not self.cfg.has_option('varnish', 'purgeurl'):
+ return
- purgeurl = self.cfg.get('varnish', 'purgeurl')
- exprlist = []
- for p in purges:
- if isinstance(p, tuple):
- # Purging a list
- exprlist.append('obj.http.x-pglm ~ :%s/%s/%s:' % p)
- else:
- # Purging individual thread
- exprlist.append('obj.http.x-pgthread ~ :%s:' % p)
- purgedict = dict(list(zip(['p%s' % n for n in range(0, len(exprlist))], exprlist)))
- purgedict['n'] = len(exprlist)
- r = requests.post(purgeurl, data=purgedict, headers={
- 'Content-type': 'application/x-www-form-urlencoded',
- 'Host': 'www.postgresql.org',
- })
- if r.status_code != 200:
- log.error("Failed to send purge request!")
+ purgeurl = self.cfg.get('varnish', 'purgeurl')
+ exprlist = []
+ for p in purges:
+ if isinstance(p, tuple):
+ # Purging a list
+ exprlist.append('obj.http.x-pglm ~ :%s/%s/%s:' % p)
+ else:
+ # Purging individual thread
+ exprlist.append('obj.http.x-pgthread ~ :%s:' % p)
+ purgedict = dict(list(zip(['p%s' % n for n in range(0, len(exprlist))], exprlist)))
+ purgedict['n'] = len(exprlist)
+ r = requests.post(purgeurl, data=purgedict, headers={
+ 'Content-type': 'application/x-www-form-urlencoded',
+ 'Host': 'www.postgresql.org',
+ })
+ if r.status_code != 200:
+ log.error("Failed to send purge request!")
from lib.varnish import VarnishPurger
def log_failed_message(listid, srctype, src, msg, err):
- try:
- msgid = msg.msgid
- except:
- msgid = "<unknown>"
- log.error("Failed to load message (msgid %s) from %s, spec %s: %s" % (msgid.encode('us-ascii', 'replace'), srctype, src, str(str(err), 'us-ascii', 'replace')))
-
- # We also put the data in the db. This happens in the main transaction
- # so if the whole script dies, it goes away...
- conn.cursor().execute("INSERT INTO loaderrors (listid, msgid, srctype, src, err) VALUES (%(listid)s, %(msgid)s, %(srctype)s, %(src)s, %(err)s)", {
- 'listid': listid,
- 'msgid': msgid,
- 'srctype': srctype,
- 'src': src,
- 'err': str(str(err), 'us-ascii', 'replace'),
- })
+ try:
+ msgid = msg.msgid
+ except:
+ msgid = "<unknown>"
+    log.error("Failed to load message (msgid %s) from %s, spec %s: %s" % (msgid, srctype, src, str(err)))
+
+ # We also put the data in the db. This happens in the main transaction
+ # so if the whole script dies, it goes away...
+ conn.cursor().execute("INSERT INTO loaderrors (listid, msgid, srctype, src, err) VALUES (%(listid)s, %(msgid)s, %(srctype)s, %(src)s, %(err)s)", {
+ 'listid': listid,
+ 'msgid': msgid,
+ 'srctype': srctype,
+ 'src': src,
+        'err': str(err),
+ })
if __name__ == "__main__":
- optparser = OptionParser()
- optparser.add_option('-l', '--list', dest='list', help='Name of list to load message for')
- optparser.add_option('-d', '--directory', dest='directory', help='Load all messages in directory')
- optparser.add_option('-m', '--mbox', dest='mbox', help='Load all messages in mbox')
- optparser.add_option('-i', '--interactive', dest='interactive', action='store_true', help='Prompt after each message')
- optparser.add_option('-v', '--verbose', dest='verbose', action='store_true', help='Verbose output')
- optparser.add_option('--force-date', dest='force_date', help='Override date (used for dates that can\'t be parsed)')
- optparser.add_option('--filter-msgid', dest='filter_msgid', help='Only process message with given msgid')
-
- (opt, args) = optparser.parse_args()
-
- if (len(args)):
- print("No bare arguments accepted")
- optparser.print_usage()
- sys.exit(1)
-
- if not opt.list:
- print("List must be specified")
- optparser.print_usage()
- sys.exit(1)
-
- if opt.directory and opt.mbox:
- print("Can't specify both directory and mbox!")
- optparser.print_usage()
- sys.exit(1)
-
- if opt.force_date and (opt.directory or opt.mbox) and not opt.filter_msgid:
- print("Can't use force_date with directory or mbox - only individual messages")
- optparser.print_usage()
- sys.exit(1)
-
- if opt.filter_msgid and not (opt.directory or opt.mbox):
- print("filter_msgid makes no sense without directory or mbox!")
- optparser.print_usage()
- sys.exit(1)
-
- log.set(opt.verbose)
-
- cfg = ConfigParser()
- cfg.read('%s/archives.ini' % os.path.realpath(os.path.dirname(sys.argv[0])))
- try:
- connstr = cfg.get('db','connstr')
- except:
- connstr = 'need_connstr'
-
- conn = psycopg2.connect(connstr)
- curs = conn.cursor()
-
- # Take an advisory lock to force serialization.
- # We could do this "properly" by reordering operations and using ON CONFLICT,
- # but concurrency is not that important and this is easier...
- try:
- curs.execute("SET statement_timeout='30s'")
- curs.execute("SELECT pg_advisory_xact_lock(8059944559669076)")
- except Exception as e:
- print(("Failed to wait on advisory lock: %s" % e))
- sys.exit(1)
-
- # Get the listid we're working on
- curs.execute("SELECT listid FROM lists WHERE listname=%(list)s", {
- 'list': opt.list
- })
- r = curs.fetchall()
- if len(r) != 1:
- log.error("List %s not found" % opt.list)
- conn.close()
- sys.exit(1)
- listid = r[0][0]
-
- purges = set()
-
- if opt.directory:
- # Parse all files in directory
- for x in os.listdir(opt.directory):
- log.status("Parsing file %s" % x)
- with open(os.path.join(opt.directory, x)) as f:
- ap = ArchivesParserStorage()
- ap.parse(f)
- if opt.filter_msgid and not ap.is_msgid(opt.filter_msgid):
- continue
- try:
- ap.analyze(date_override=opt.force_date)
- except IgnorableException as e:
- log_failed_message(listid, "directory", os.path.join(opt.directory, x), ap, e)
- opstatus.failed += 1
- continue
- ap.store(conn, listid)
- purges.update(ap.purges)
- if opt.interactive:
- print("Interactive mode, committing transaction")
- conn.commit()
- print("Proceed to next message with Enter, or input a period (.) to stop processing")
- x = input()
- if x == '.':
- print("Ok, aborting!")
- break
- print("---------------------------------")
- elif opt.mbox:
- if not os.path.isfile(opt.mbox):
- print("File %s does not exist" % opt.mbox)
- sys.exit(1)
- mboxparser = MailboxBreakupParser(opt.mbox)
- while not mboxparser.EOF:
- ap = ArchivesParserStorage()
- msg = next(mboxparser)
- if not msg:
- break
- ap.parse(msg)
- if opt.filter_msgid and not ap.is_msgid(opt.filter_msgid):
- continue
- try:
- ap.analyze(date_override=opt.force_date)
- except IgnorableException as e:
- log_failed_message(listid, "mbox", opt.mbox, ap, e)
- opstatus.failed += 1
- continue
- ap.store(conn, listid)
- purges.update(ap.purges)
- if mboxparser.returncode():
- log.error("Failed to parse mbox:")
- log.error(mboxparser.stderr_output())
- sys.exit(1)
- else:
- # Parse single message on stdin
- ap = ArchivesParserStorage()
- ap.parse(sys.stdin.buffer)
- try:
- ap.analyze(date_override=opt.force_date)
- except IgnorableException as e:
- log_failed_message(listid, "stdin","", ap, e)
- conn.close()
- sys.exit(1)
- ap.store(conn, listid)
- purges.update(ap.purges)
- if opstatus.stored:
- log.log("Stored message with message-id %s" % ap.msgid)
-
- conn.commit()
- conn.close()
- opstatus.print_status()
-
- VarnishPurger(cfg).purge(purges)
+ optparser = OptionParser()
+ optparser.add_option('-l', '--list', dest='list', help='Name of list to load message for')
+ optparser.add_option('-d', '--directory', dest='directory', help='Load all messages in directory')
+ optparser.add_option('-m', '--mbox', dest='mbox', help='Load all messages in mbox')
+ optparser.add_option('-i', '--interactive', dest='interactive', action='store_true', help='Prompt after each message')
+ optparser.add_option('-v', '--verbose', dest='verbose', action='store_true', help='Verbose output')
+ optparser.add_option('--force-date', dest='force_date', help='Override date (used for dates that can\'t be parsed)')
+ optparser.add_option('--filter-msgid', dest='filter_msgid', help='Only process message with given msgid')
+
+ (opt, args) = optparser.parse_args()
+
+ if (len(args)):
+ print("No bare arguments accepted")
+ optparser.print_usage()
+ sys.exit(1)
+
+ if not opt.list:
+ print("List must be specified")
+ optparser.print_usage()
+ sys.exit(1)
+
+ if opt.directory and opt.mbox:
+ print("Can't specify both directory and mbox!")
+ optparser.print_usage()
+ sys.exit(1)
+
+ if opt.force_date and (opt.directory or opt.mbox) and not opt.filter_msgid:
+ print("Can't use force_date with directory or mbox - only individual messages")
+ optparser.print_usage()
+ sys.exit(1)
+
+ if opt.filter_msgid and not (opt.directory or opt.mbox):
+ print("filter_msgid makes no sense without directory or mbox!")
+ optparser.print_usage()
+ sys.exit(1)
+
+ log.set(opt.verbose)
+
+ cfg = ConfigParser()
+ cfg.read('%s/archives.ini' % os.path.realpath(os.path.dirname(sys.argv[0])))
+ try:
+ connstr = cfg.get('db','connstr')
+ except:
+ connstr = 'need_connstr'
+
+ conn = psycopg2.connect(connstr)
+ curs = conn.cursor()
+
+ # Take an advisory lock to force serialization.
+ # We could do this "properly" by reordering operations and using ON CONFLICT,
+ # but concurrency is not that important and this is easier...
+ try:
+ curs.execute("SET statement_timeout='30s'")
+ curs.execute("SELECT pg_advisory_xact_lock(8059944559669076)")
+ except Exception as e:
+ print(("Failed to wait on advisory lock: %s" % e))
+ sys.exit(1)
+
+ # Get the listid we're working on
+ curs.execute("SELECT listid FROM lists WHERE listname=%(list)s", {
+ 'list': opt.list
+ })
+ r = curs.fetchall()
+ if len(r) != 1:
+ log.error("List %s not found" % opt.list)
+ conn.close()
+ sys.exit(1)
+ listid = r[0][0]
+
+ purges = set()
+
+ if opt.directory:
+ # Parse all files in directory
+ for x in os.listdir(opt.directory):
+ log.status("Parsing file %s" % x)
+ with open(os.path.join(opt.directory, x)) as f:
+ ap = ArchivesParserStorage()
+ ap.parse(f)
+ if opt.filter_msgid and not ap.is_msgid(opt.filter_msgid):
+ continue
+ try:
+ ap.analyze(date_override=opt.force_date)
+ except IgnorableException as e:
+ log_failed_message(listid, "directory", os.path.join(opt.directory, x), ap, e)
+ opstatus.failed += 1
+ continue
+ ap.store(conn, listid)
+ purges.update(ap.purges)
+ if opt.interactive:
+ print("Interactive mode, committing transaction")
+ conn.commit()
+ print("Proceed to next message with Enter, or input a period (.) to stop processing")
+ x = input()
+ if x == '.':
+ print("Ok, aborting!")
+ break
+ print("---------------------------------")
+ elif opt.mbox:
+ if not os.path.isfile(opt.mbox):
+ print("File %s does not exist" % opt.mbox)
+ sys.exit(1)
+ mboxparser = MailboxBreakupParser(opt.mbox)
+ while not mboxparser.EOF:
+ ap = ArchivesParserStorage()
+ msg = next(mboxparser)
+ if not msg:
+ break
+ ap.parse(msg)
+ if opt.filter_msgid and not ap.is_msgid(opt.filter_msgid):
+ continue
+ try:
+ ap.analyze(date_override=opt.force_date)
+ except IgnorableException as e:
+ log_failed_message(listid, "mbox", opt.mbox, ap, e)
+ opstatus.failed += 1
+ continue
+ ap.store(conn, listid)
+ purges.update(ap.purges)
+ if mboxparser.returncode():
+ log.error("Failed to parse mbox:")
+ log.error(mboxparser.stderr_output())
+ sys.exit(1)
+ else:
+ # Parse single message on stdin
+ ap = ArchivesParserStorage()
+ ap.parse(sys.stdin.buffer)
+ try:
+ ap.analyze(date_override=opt.force_date)
+ except IgnorableException as e:
+ log_failed_message(listid, "stdin","", ap, e)
+ conn.close()
+ sys.exit(1)
+ ap.store(conn, listid)
+ purges.update(ap.purges)
+ if opstatus.stored:
+ log.log("Stored message with message-id %s" % ap.msgid)
+
+ conn.commit()
+ conn.close()
+ opstatus.print_status()
+
+ VarnishPurger(cfg).purge(purges)
import requests
if __name__=="__main__":
- parser = argparse.ArgumentParser(description="Synchronize lists from pglister")
- parser.add_argument('--dryrun', action='store_true', help="Don't commit changes to database")
+ parser = argparse.ArgumentParser(description="Synchronize lists from pglister")
+ parser.add_argument('--dryrun', action='store_true', help="Don't commit changes to database")
- args = parser.parse_args()
+ args = parser.parse_args()
- cfg = ConfigParser()
- cfg.read('%s/archives.ini' % os.path.realpath(os.path.dirname(sys.argv[0])))
- try:
- connstr = cfg.get('db','connstr')
- except:
- connstr = 'need_connstr'
+ cfg = ConfigParser()
+ cfg.read('%s/archives.ini' % os.path.realpath(os.path.dirname(sys.argv[0])))
+ try:
+ connstr = cfg.get('db','connstr')
+ except:
+ connstr = 'need_connstr'
- if cfg.has_option('pglister', 'subscribers') and cfg.getint('pglister', 'subscribers'):
- do_subscribers=1
- else:
- do_subscribers=0
+ if cfg.has_option('pglister', 'subscribers') and cfg.getint('pglister', 'subscribers'):
+ do_subscribers=1
+ else:
+ do_subscribers=0
- psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
- conn = psycopg2.connect(connstr)
- curs = conn.cursor()
+ psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
+ conn = psycopg2.connect(connstr)
+ curs = conn.cursor()
- r = requests.get('{0}/api/archive/{1}/lists/?subscribers={2}'.format(
- cfg.get('pglister', 'root'),
- cfg.get('pglister', 'myname'),
- do_subscribers and 1 or 0,
- ), headers={
- 'X-Api-Key': cfg.get('pglister', 'apikey'),
- })
- obj = r.json()
+ r = requests.get('{0}/api/archive/{1}/lists/?subscribers={2}'.format(
+ cfg.get('pglister', 'root'),
+ cfg.get('pglister', 'myname'),
+ do_subscribers and 1 or 0,
+ ), headers={
+ 'X-Api-Key': cfg.get('pglister', 'apikey'),
+ })
+ obj = r.json()
- # For groups, just add them if they don't exist
- groups = {g['group']['id']:g['group']['groupname'] for g in obj}
+ # For groups, just add them if they don't exist
+ groups = {g['group']['id']:g['group']['groupname'] for g in obj}
- for id,name in list(groups.items()):
- curs.execute("SELECT EXISTS (SELECT 1 FROM listgroups WHERE groupname=%(group)s)", {
- 'group': name,
- })
- if not curs.fetchone()[0]:
- curs.execute("INSERT INTO listgroups (groupname, sortkey) VALUES (%(group)s, 100) RETURNING groupname", {
- 'group': name,
- })
- print("Added group %s" % name)
+ for id,name in list(groups.items()):
+ curs.execute("SELECT EXISTS (SELECT 1 FROM listgroups WHERE groupname=%(group)s)", {
+ 'group': name,
+ })
+ if not curs.fetchone()[0]:
+ curs.execute("INSERT INTO listgroups (groupname, sortkey) VALUES (%(group)s, 100) RETURNING groupname", {
+ 'group': name,
+ })
+ print("Added group %s" % name)
- # Add any missing lists, and synchronize their contents.
- for l in obj:
- curs.execute("SELECT listid,listname FROM lists WHERE listname=%(name)s", {
- 'name': l['listname'],
- })
- if curs.rowcount == 0:
- curs.execute("INSERT INTO lists (listname, shortdesc, description, active, groupid) SELECT %(name)s, %(name)s, %(desc)s, 't', groupid FROM listgroups WHERE groupname=%(groupname)s RETURNING listid, listname", {
- 'name': l['listname'],
- 'desc': l['longdesc'],
- 'groupname': l['group']['groupname'],
- })
- listid, name = curs.fetchone()
- print("Added list %s" % name)
- else:
- listid, name = curs.fetchone()
- curs.execute("UPDATE lists SET shortdesc=%(name)s, description=%(desc)s, groupid=(SELECT groupid FROM listgroups WHERE groupname=%(groupname)s), active=true WHERE listid=%(id)s AND NOT (active AND shortdesc=%(name)s AND description=%(desc)s AND groupid=(SELECT groupid FROM listgroups WHERE groupname=%(groupname)s)) RETURNING listname", {
- 'id': listid,
- 'name': l['listname'],
- 'desc': l['longdesc'],
- 'groupname': l['group']['groupname'],
- })
- for n, in curs.fetchall():
- print("Updated list %s " % n)
+ # Add any missing lists, and synchronize their contents.
+ for l in obj:
+ curs.execute("SELECT listid,listname FROM lists WHERE listname=%(name)s", {
+ 'name': l['listname'],
+ })
+ if curs.rowcount == 0:
+ curs.execute("INSERT INTO lists (listname, shortdesc, description, active, groupid) SELECT %(name)s, %(name)s, %(desc)s, 't', groupid FROM listgroups WHERE groupname=%(groupname)s RETURNING listid, listname", {
+ 'name': l['listname'],
+ 'desc': l['longdesc'],
+ 'groupname': l['group']['groupname'],
+ })
+ listid, name = curs.fetchone()
+ print("Added list %s" % name)
+ else:
+ listid, name = curs.fetchone()
+ curs.execute("UPDATE lists SET shortdesc=%(name)s, description=%(desc)s, groupid=(SELECT groupid FROM listgroups WHERE groupname=%(groupname)s), active=true WHERE listid=%(id)s AND NOT (active AND shortdesc=%(name)s AND description=%(desc)s AND groupid=(SELECT groupid FROM listgroups WHERE groupname=%(groupname)s)) RETURNING listname", {
+ 'id': listid,
+ 'name': l['listname'],
+ 'desc': l['longdesc'],
+ 'groupname': l['group']['groupname'],
+ })
+ for n, in curs.fetchall():
+ print("Updated list %s " % n)
- if do_subscribers:
- # If we synchronize subscribers, we do so on all lists for now.
- curs.execute("WITH t(u) AS (SELECT UNNEST(%(usernames)s)), ins(un) AS (INSERT INTO listsubscribers (username, list_id) SELECT u, %(listid)s FROM t WHERE NOT EXISTS (SELECT 1 FROM listsubscribers WHERE username=u AND list_id=%(listid)s) RETURNING username), del(un) AS (DELETE FROM listsubscribers WHERE list_id=%(listid)s AND NOT EXISTS (SELECT 1 FROM t WHERE u=username) RETURNING username) SELECT 'ins',un FROM ins UNION ALL SELECT 'del',un FROM del ORDER BY 1,2", {
- 'usernames': l['subscribers'],
- 'listid': listid,
- })
- for what, who in curs.fetchall():
- if what == 'ins':
- print("Added subscriber %s to list %s" % (who, name))
- else:
- print("Removed subscriber %s from list %s" % (who, name))
+ if do_subscribers:
+ # If we synchronize subscribers, we do so on all lists for now.
+ curs.execute("WITH t(u) AS (SELECT UNNEST(%(usernames)s)), ins(un) AS (INSERT INTO listsubscribers (username, list_id) SELECT u, %(listid)s FROM t WHERE NOT EXISTS (SELECT 1 FROM listsubscribers WHERE username=u AND list_id=%(listid)s) RETURNING username), del(un) AS (DELETE FROM listsubscribers WHERE list_id=%(listid)s AND NOT EXISTS (SELECT 1 FROM t WHERE u=username) RETURNING username) SELECT 'ins',un FROM ins UNION ALL SELECT 'del',un FROM del ORDER BY 1,2", {
+ 'usernames': l['subscribers'],
+ 'listid': listid,
+ })
+ for what, who in curs.fetchall():
+ if what == 'ins':
+ print("Added subscriber %s to list %s" % (who, name))
+ else:
+ print("Removed subscriber %s from list %s" % (who, name))
- # We don't remove lists ever, because we probably want to keep archives around.
- # But for now, we alert on them.
- curs.execute("SELECT listname FROM lists WHERE active AND NOT listname=ANY(%(lists)s)", {
- 'lists': [l['listname'] for l in obj],
- })
- for n, in curs.fetchall():
- print("List %s exists in archives, but not in upstream! Should it be marked inactive?" % n)
+ # We don't remove lists ever, because we probably want to keep archives around.
+ # But for now, we alert on them.
+ curs.execute("SELECT listname FROM lists WHERE active AND NOT listname=ANY(%(lists)s)", {
+ 'lists': [l['listname'] for l in obj],
+ })
+ for n, in curs.fetchall():
+ print("List %s exists in archives, but not in upstream! Should it be marked inactive?" % n)
- if args.dryrun:
- print("Dry-run, rolling back")
- conn.rollback()
- else:
- conn.commit()
- conn.close()
+ if args.dryrun:
+ print("Dry-run, rolling back")
+ conn.rollback()
+ else:
+ conn.commit()
+ conn.close()
from lib.varnish import VarnishPurger
if __name__ == "__main__":
- optparser = OptionParser()
- optparser.add_option('-m', '--msgid', dest='msgid', help='Messageid to load')
-
- (opt, args) = optparser.parse_args()
-
- if (len(args)):
- print("No bare arguments accepted")
- optparser.print_help()
- sys.exit(1)
-
- if not opt.msgid:
- print("Message-id must be specified")
- optparser.print_help()
- sys.exit(1)
-
- cfg = ConfigParser()
- cfg.read('%s/archives.ini' % os.path.realpath(os.path.dirname(sys.argv[0])))
- try:
- connstr = cfg.get('db','connstr')
- except:
- connstr = 'need_connstr'
-
- conn = psycopg2.connect(connstr)
- curs = conn.cursor()
-
- curs.execute("SELECT id, threadid FROM messages WHERE messageid=%(msgid)s", {
- 'msgid': opt.msgid,
- })
- id, threadid = curs.fetchone()
-
- VarnishPurger(cfg).purge([int(threadid), ])
- conn.close()
+ optparser = OptionParser()
+ optparser.add_option('-m', '--msgid', dest='msgid', help='Messageid to load')
+
+ (opt, args) = optparser.parse_args()
+
+ if (len(args)):
+ print("No bare arguments accepted")
+ optparser.print_help()
+ sys.exit(1)
+
+ if not opt.msgid:
+ print("Message-id must be specified")
+ optparser.print_help()
+ sys.exit(1)
+
+ cfg = ConfigParser()
+ cfg.read('%s/archives.ini' % os.path.realpath(os.path.dirname(sys.argv[0])))
+ try:
+ connstr = cfg.get('db','connstr')
+ except:
+ connstr = 'need_connstr'
+
+ conn = psycopg2.connect(connstr)
+ curs = conn.cursor()
+
+ curs.execute("SELECT id, threadid FROM messages WHERE messageid=%(msgid)s", {
+ 'msgid': opt.msgid,
+ })
+ id, threadid = curs.fetchone()
+
+ VarnishPurger(cfg).purge([int(threadid), ])
+ conn.close()
from lib.varnish import VarnishPurger
def ResultIter(cursor):
- # Fetch lots of data but keep memory usage down a bit, by feeding it out of
- # a generator, and use fetchmany()
- while True:
- results = cursor.fetchmany(5000)
- if not results:
- break
- for r in results:
- yield r
+ # Fetch lots of data but keep memory usage down a bit, by feeding it out of
+ # a generator, and use fetchmany()
+ while True:
+ results = cursor.fetchmany(5000)
+ if not results:
+ break
+ for r in results:
+ yield r
if __name__ == "__main__":
- optparser = OptionParser()
- optparser.add_option('-m', '--msgid', dest='msgid', help='Messageid to load')
- optparser.add_option('--all', dest='all', action='store_true', help='Load *all* messages currently in the db')
- optparser.add_option('--sample', dest='sample', help='Load a sample of <n> messages')
- optparser.add_option('-v', '--verbose', dest='verbose', action='store_true', help='Verbose output')
- optparser.add_option('--force-date', dest='force_date', help='Override date (used for dates that can\'t be parsed)')
- optparser.add_option('--update', dest='update', action='store_true', help='Actually update, not just diff (default is diff)')
- optparser.add_option('--commit', dest='commit', action='store_true', help='Commit the transaction without asking')
-
- (opt, args) = optparser.parse_args()
-
- if (len(args)):
- print("No bare arguments accepted")
- optparser.print_usage()
- sys.exit(1)
-
- if sum([1 for x in [opt.all, opt.sample, opt.msgid] if x]) != 1:
- print("Must specify exactly one of --msgid, --all and --sample")
- sys.exit(1)
-
- if not opt.update and os.path.exists('reparse.diffs'):
- print("File reparse.diffs already exists. Remove or rename and try again.")
- sys.exit(1)
-
- log.set(opt.verbose)
-
- cfg = ConfigParser()
- cfg.read('%s/archives.ini' % os.path.realpath(os.path.dirname(sys.argv[0])))
- try:
- connstr = cfg.get('db','connstr')
- except:
- connstr = 'need_connstr'
-
- conn = psycopg2.connect(connstr)
-
- # Get messages
- curs = conn.cursor('msglist')
- if opt.all:
- curs2 = conn.cursor()
- curs2.execute("SELECT count(*) FROM messages WHERE hiddenstatus IS NULL")
- totalcount, = curs2.fetchone()
- curs.execute("SELECT id, rawtxt FROM messages WHERE hiddenstatus IS NULL ORDER BY id")
- elif opt.sample:
- totalcount = int(opt.sample)
- curs.execute("SELECT id, rawtxt FROM messages WHERE hiddenstatus IS NULL ORDER BY id DESC LIMIT %(num)s", {
- 'num': int(opt.sample),
- })
- else:
- totalcount = 1
- curs.execute("SELECT id, rawtxt FROM messages WHERE messageid=%(msgid)s", {
- 'msgid': opt.msgid,
- })
-
- if not opt.update:
- f = codecs.open("reparse.diffs", "w", "utf-8")
- fromonlyf = open("reparse.fromonly","w")
-
- firststatus = datetime.now()
- laststatus = datetime.now()
- num = 0
- updated = 0
- for id, rawtxt in ResultIter(curs):
- num += 1
- ap = ArchivesParserStorage()
- ap.parse(BytesIO(rawtxt))
- try:
- ap.analyze(date_override=opt.force_date)
- except IgnorableException as e:
- if opt.update:
- print("Exception loading {0}: {1}".format(id, e))
- else:
- f.write("Exception loading %s: %s" % (id, e))
- continue
-
- if opt.update:
- if ap.store(conn, listid=-9, overwrite=True):
- updated += 1
- else:
- ap.diff(conn, f, fromonlyf, id)
- if datetime.now() - laststatus > timedelta(seconds=5):
- sys.stdout.write("%s messages parsed (%s%%, %s / second), %s updated\r" % (num,
- num*100/totalcount,
- num / ((datetime.now()-firststatus).seconds),
- updated))
- sys.stdout.flush()
- laststatus = datetime.now()
-
- print("")
-
- if opt.update:
- opstatus.print_status()
- if not opt.commit:
- while True:
- print("OK to commit transaction? ")
- a = input().lower().strip()
- if a == 'y' or a == 'yes':
- print("Ok, committing.")
- break
- elif a == 'n' or a == 'no':
- print("Aborting and rolling back")
- conn.rollback()
- sys.exit(1)
- conn.commit()
- VarnishPurger(cfg).purge(ap.purges)
- else:
- fromonlyf.close()
- f.close()
- if os.path.getsize('reparse.diffs') == 0:
- os.unlink('reparse.diffs')
- if os.path.getsize('reparse.fromonly') == 0:
- os.unlink('reparse.fromonly')
-
- # Just in case
- conn.rollback()
- conn.close()
+ optparser = OptionParser()
+ optparser.add_option('-m', '--msgid', dest='msgid', help='Messageid to load')
+ optparser.add_option('--all', dest='all', action='store_true', help='Load *all* messages currently in the db')
+ optparser.add_option('--sample', dest='sample', help='Load a sample of <n> messages')
+ optparser.add_option('-v', '--verbose', dest='verbose', action='store_true', help='Verbose output')
+ optparser.add_option('--force-date', dest='force_date', help='Override date (used for dates that can\'t be parsed)')
+ optparser.add_option('--update', dest='update', action='store_true', help='Actually update, not just diff (default is diff)')
+ optparser.add_option('--commit', dest='commit', action='store_true', help='Commit the transaction without asking')
+
+ (opt, args) = optparser.parse_args()
+
+ if (len(args)):
+ print("No bare arguments accepted")
+ optparser.print_usage()
+ sys.exit(1)
+
+ if sum([1 for x in [opt.all, opt.sample, opt.msgid] if x]) != 1:
+ print("Must specify exactly one of --msgid, --all and --sample")
+ sys.exit(1)
+
+ if not opt.update and os.path.exists('reparse.diffs'):
+ print("File reparse.diffs already exists. Remove or rename and try again.")
+ sys.exit(1)
+
+ log.set(opt.verbose)
+
+ cfg = ConfigParser()
+ cfg.read('%s/archives.ini' % os.path.realpath(os.path.dirname(sys.argv[0])))
+ try:
+ connstr = cfg.get('db','connstr')
+ except:
+ connstr = 'need_connstr'
+
+ conn = psycopg2.connect(connstr)
+
+ # Get messages
+ curs = conn.cursor('msglist')
+ if opt.all:
+ curs2 = conn.cursor()
+ curs2.execute("SELECT count(*) FROM messages WHERE hiddenstatus IS NULL")
+ totalcount, = curs2.fetchone()
+ curs.execute("SELECT id, rawtxt FROM messages WHERE hiddenstatus IS NULL ORDER BY id")
+ elif opt.sample:
+ totalcount = int(opt.sample)
+ curs.execute("SELECT id, rawtxt FROM messages WHERE hiddenstatus IS NULL ORDER BY id DESC LIMIT %(num)s", {
+ 'num': int(opt.sample),
+ })
+ else:
+ totalcount = 1
+ curs.execute("SELECT id, rawtxt FROM messages WHERE messageid=%(msgid)s", {
+ 'msgid': opt.msgid,
+ })
+
+ if not opt.update:
+ f = codecs.open("reparse.diffs", "w", "utf-8")
+ fromonlyf = open("reparse.fromonly","w")
+
+ firststatus = datetime.now()
+ laststatus = datetime.now()
+ num = 0
+ updated = 0
+ for id, rawtxt in ResultIter(curs):
+ num += 1
+ ap = ArchivesParserStorage()
+ ap.parse(BytesIO(rawtxt))
+ try:
+ ap.analyze(date_override=opt.force_date)
+ except IgnorableException as e:
+ if opt.update:
+ print("Exception loading {0}: {1}".format(id, e))
+ else:
+ f.write("Exception loading %s: %s" % (id, e))
+ continue
+
+ if opt.update:
+ if ap.store(conn, listid=-9, overwrite=True):
+ updated += 1
+ else:
+ ap.diff(conn, f, fromonlyf, id)
+ if datetime.now() - laststatus > timedelta(seconds=5):
+ sys.stdout.write("%s messages parsed (%s%%, %s / second), %s updated\r" % (num,
+ num*100/totalcount,
+ num / ((datetime.now()-firststatus).seconds),
+ updated))
+ sys.stdout.flush()
+ laststatus = datetime.now()
+
+ print("")
+
+ if opt.update:
+ opstatus.print_status()
+ if not opt.commit:
+ while True:
+ print("OK to commit transaction? ")
+ a = input().lower().strip()
+ if a == 'y' or a == 'yes':
+ print("Ok, committing.")
+ break
+ elif a == 'n' or a == 'no':
+ print("Aborting and rolling back")
+ conn.rollback()
+ sys.exit(1)
+ conn.commit()
+ VarnishPurger(cfg).purge(ap.purges)
+ else:
+ fromonlyf.close()
+ f.close()
+ if os.path.getsize('reparse.diffs') == 0:
+ os.unlink('reparse.diffs')
+ if os.path.getsize('reparse.fromonly') == 0:
+ os.unlink('reparse.fromonly')
+
+ # Just in case
+ conn.rollback()
+ conn.close()
from lib.storage import ArchivesParserStorage
if __name__ == "__main__":
    # Interactive maintenance tool: fetch one raw message from the archives
    # database, open it in vim for hand-editing, show a unified diff of the
    # change, and on confirmation write it back — keeping a copy of the
    # original row in messages_edited first.
    optparser = OptionParser()
    optparser.add_option('-m', dest='msgid', help='Messageid to edit')
    optparser.add_option('-i', dest='id', help='Message primary key id to edit')
    optparser.add_option('-c', dest='charset', help='Charset to edit as', default='utf8')
    optparser.add_option('--nodiff', dest='nodiff', action="store_true", help='Disable viewing of diff', default=False)
    (opt, args) = optparser.parse_args()

    if len(args):
        print("No bare arguments accepted")
        optparser.print_usage()
        sys.exit(1)

    cfg = ConfigParser()
    cfg.read('%s/../archives.ini' % os.path.realpath(os.path.dirname(sys.argv[0])))
    try:
        connstr = cfg.get('db', 'connstr')
    except Exception:
        # No usable config; this placeholder makes psycopg2 fail with a
        # recognizable error instead of an obscure one.
        connstr = 'need_connstr'

    conn = psycopg2.connect(connstr)
    curs = conn.cursor()

    # Exactly one of -m (messageid) and -i (primary key) must be given.
    if not (opt.msgid or opt.id):
        print("Need -m or -i!")
        sys.exit(1)
    if opt.msgid and opt.id:
        print("Can't specify both -m and -i!")
        sys.exit(1)

    if opt.msgid:
        curs.execute("SELECT id, rawtxt FROM messages WHERE messageid=%(msgid)s", {
            'msgid': opt.msgid,
        })
    else:
        curs.execute("SELECT id, rawtxt FROM messages WHERE id=%(id)s", {
            'id': opt.id,
        })

    id, rawtxt = curs.fetchone()
    # rawtxt comes from a bytea column, so keep it as bytes until it has to
    # be decoded for display (the sibling reparse script does the same).
    s = BytesIO(rawtxt)

    # Edit via a temp file; delete=False because vim runs after we close it,
    # so we must unlink it ourselves in the finally block below.
    f = tempfile.NamedTemporaryFile(delete=False)
    try:
        f.write(s.getvalue())
        f.close()
        os.system("vim %s" % f.name)
        f2 = open(f.name, "rb")
        s2 = f2.read()
        f2.close()

        if not opt.nodiff:
            print("\n".join(difflib.unified_diff(s.getvalue().decode(opt.charset).splitlines(),
                                                 s2.decode(opt.charset).splitlines(),
                                                 fromfile='old',
                                                 tofile='new',
                                                 lineterm='')))

        while True:
            a = input('Save this to db?').lower()
            if a == 'y' or a == 'yes':
                # Preserve the pre-edit row so the edit is recoverable.
                curs.execute("INSERT INTO messages_edited SELECT * FROM messages WHERE id=%(id)s", {
                    'id': id,
                })
                curs.execute("UPDATE messages SET rawtxt=%(raw)s WHERE id=%(id)s", {
                    'id': id,
                    'raw': bytearray(s2),
                })
                conn.commit()
                break
            elif a == 'n' or a == 'no':
                print("Ok, not saving")
                break
    finally:
        try:
            f.close()
        except Exception:
            pass
        os.unlink(f.name)
from lib.storage import ArchivesParserStorage
if __name__ == "__main__":
    # Batch-repair messages whose body had "From " at start-of-line mangled
    # into ">From " (classic mbox escaping). Message-ids are read one per
    # line from the file 'fromlist'. Each message is processed under a
    # savepoint so a rejected or already-fixed message can be rolled back
    # without losing the rest of the batch; everything kept is committed at
    # the end.
    cfg = ConfigParser()
    cfg.read('%s/../archives.ini' % os.path.realpath(os.path.dirname(sys.argv[0])))
    try:
        connstr = cfg.get('db', 'connstr')
    except Exception:
        connstr = 'need_connstr'

    conn = psycopg2.connect(connstr)
    curs = conn.cursor()

    with open('fromlist', 'r') as f:
        for l in f:
            curs.execute("SAVEPOINT msg")

            msgid = l.strip()
            curs.execute("SELECT id, rawtxt, bodytxt FROM messages WHERE messageid=%(msgid)s", {
                'msgid': msgid,
            })
            id, rawtxt, bodytxt = curs.fetchone()

            ap = ArchivesParserStorage()
            # rawtxt is a bytea column; work on it as bytes throughout
            # (matching the reparse script, which feeds BytesIO to parse()).
            s = BytesIO(rawtxt)

            # Parse the old message, so we can tell whether it still needs fixing.
            ap.parse(s)
            ap.analyze()

            # Double check...
            if bodytxt.decode('utf8') == ap.bodytxt:
                print("Message already fixed: %s" % msgid)
                curs.execute("ROLLBACK TO SAVEPOINT msg")
                continue

            # Now try to fix it...
            s.seek(0)

            # Bytes-level substitution: un-escape ">From " at line starts.
            fixed = re.sub(b'^>From ', b'From ', s.getvalue(), flags=re.MULTILINE)

            curs.execute("UPDATE messages SET rawtxt=%(raw)s WHERE messageid=%(msgid)s", {
                'msgid': msgid,
                'raw': bytearray(fixed),
            })

            # Ok, read it back and try again
            curs.execute("SELECT id, rawtxt, bodytxt FROM messages WHERE messageid=%(msgid)s", {
                'msgid': msgid,
            })
            id, rawtxt, bodytxt = curs.fetchone()

            ap = ArchivesParserStorage()

            # Re-parse the rewritten message to verify the fix took effect.
            ap.parse(BytesIO(rawtxt))
            ap.analyze()

            if ap.bodytxt != bodytxt.decode('utf8'):
                print("Failed to fix %s!" % msgid)

                # Generate diff to show what we changed
                print("CHANGED:")
                print("\n".join(difflib.unified_diff(s.getvalue().decode('utf8', errors='replace').splitlines(),
                                                     fixed.decode('utf8', errors='replace').splitlines(),
                                                     fromfile='old',
                                                     tofile='new',
                                                     n=2,
                                                     lineterm='')))
                print("----")
                # Generate a diff to show what's left
                print("REMAINING:")
                print("\n".join(difflib.unified_diff(bodytxt.decode('utf8').splitlines(),
                                                     ap.bodytxt.splitlines(),
                                                     fromfile='old',
                                                     tofile='new',
                                                     n=2,
                                                     lineterm='')))
                print("--------------")
                while True:
                    a = input('Save this change anyway?').lower()
                    if a == 'y' or a == 'yes':
                        print("Ok, saving!")
                        curs.execute("RELEASE SAVEPOINT msg")
                        break
                    elif a == 'n' or a == 'no':
                        print("Ok, rolling back!")
                        curs.execute("ROLLBACK TO SAVEPOINT msg")
                        break
                    elif a == 'yq':
                        print("Ok, committing and then exiting")
                        curs.execute("RELEASE SAVEPOINT msg")
                        conn.commit()
                        conn.close()
                        sys.exit(0)
            else:
                print("Fixed %s!" % msgid)
                curs.execute("RELEASE SAVEPOINT msg")
            s.close()

    print("Committing all that's there...")
    conn.commit()
    conn.close()