try:
    import psyco  # :-)
except ImportError:
    pass  # gee whiz! :-(

import collections
import datetime
import itertools
import re
import urllib

try:
    import StringIO
except ImportError:
    # Python 3 has no StringIO module; io.StringIO is the replacement class.
    import io as StringIO

try:
    import sisynala.main
except ImportError:
    # Only the Apache log-line parser calls into Sisynala.  Leaving the name
    # bound to None keeps the pure helpers in this module importable without
    # it; anything that does call sisynala.main will then fail with an
    # AttributeError at that call rather than here at import time.
    sisynala = None


# Month abbreviations as they appear in Apache timestamps.  Kept as an explicit
# table (rather than strptime's %b) so parsing does not depend on the locale.
_MONTHS = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
           'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}


def format_list(l):
    """Render (value, label) pairs as text, one 'label: value' line per pair.

    Each element of `l` must unpack into exactly two items; the first is
    formatted with %d and the second with %s.  Every line, including the
    last, ends with a newline; an empty `l` gives an empty string.
    """
    return ''.join(['%s: %d\n' % (thing, val) for val, thing in l])


# FIXME: This code is highly repetitive. :-)
# FIXME: I have stderr and stdout mixed up or something?

# Source: http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/304373
class peekable(object):
    """An iterator that supports a peek operation.

    Items pulled from the wrapped iterable for a peek are held in a deque
    and handed out again, in order, by later next() calls.
    """

    def __init__(self, iterable):
        self._iterable = iter(iterable)
        self._cache = collections.deque()

    def __iter__(self):
        return self

    def _fillcache(self, n):
        """Buffer items until at least `n` are cached.

        StopIteration from the wrapped iterator propagates to the caller;
        whatever was buffered before it stays in the cache.
        """
        while len(self._cache) < n:
            self._cache.append(next(self._iterable))

    def next(self, n=None):
        """Consume and return the next item, or a list of the next `n` items."""
        self._fillcache(1 if n is None else n)
        if n is None:
            return self._cache.popleft()
        return [self._cache.popleft() for _ in range(n)]

    # Python 3 spells the iterator protocol method __next__.
    __next__ = next

    def peek(self, n=None):
        """Return the next item (or a list of the next `n`) without consuming."""
        self._fillcache(1 if n is None else n)
        if n is None:
            return self._cache[0]
        return [self._cache[i] for i in range(n)]


def apachetime2datetime(s):
    """Convert an Apache log timestamp to a naive datetime in UTC.

    `s` looks like '10/Oct/2000:13:55:36 -0700': day/month-abbreviation/year,
    a colon, the wall-clock time, a space, then a signed HHMM zone offset.

    Raises AssertionError if the offset does not start with '+' or '-',
    KeyError for an unknown month abbreviation, and ValueError if the string
    does not split into the expected pieces or a numeric field is not a number.
    """
    date_and_time, offset = s.split(' ', 1)
    date, time = date_and_time.split(':', 1)
    day, monthname, year = date.split('/')
    # FIXME: What am I *doing* parsing months by myself?
    month = _MONTHS[monthname]
    hour, minute, second = [int(piece) for piece in time.split(':')]
    offset = offset.strip()
    direction, h_off, m_off = offset[0], int(offset[1:3]), int(offset[3:])
    offset_delta = datetime.timedelta(minutes=m_off, hours=h_off)
    pre_offset = datetime.datetime(int(year), month, int(day),
                                   hour, minute, second)
    # Note: - means +, + means -!  A zone ahead of UTC ('+') has to be
    # subtracted to get back to UTC, and vice versa.
    if direction == '+':
        return pre_offset - offset_delta
    if direction == '-':
        return pre_offset + offset_delta
    raise AssertionError("That's not an offset at all!")
# Squid timestamps are seconds since the Unix epoch.
_EPOCH = datetime.datetime(1970, 1, 1)

# Number of leading characters sliced off a Squid URI before it is compared
# with the Apache one.
_SQUID_URI_PREFIX_LEN = len('http://search.creativecommons.org:81')  # == 36

# Query-string marker that identifies a request as a Mozilla search-box hit.
_MOZILLA_MARKER = 'sourceid=Mozilla-search'


def parseSquidLine(s):
    """Parse one Squid access-log line into a dict.

    Returns {} for a blank line.  Otherwise the line must consist of exactly
    ten whitespace-separated fields (ValueError if not) and the result has
    the keys 'date', 'ip', 'response', 'bytescount' (int), 'method', 'uri',
    'referer', 'upstreamserver' and 'mimetype'.

    'date' is a naive datetime in UTC, so it can be compared directly with
    the UTC-normalised datetimes produced for Apache lines.
    """
    fields = s.split()
    if not fields:
        return {}
    (datestring, trash, ip, response, bytescount, method, uri, referer,
     upstreamserver, mimetype) = fields
    # FIXME: What's "trash"?  It looks like a number of some kind.
    # NOTE(review): in Squid's native log format the second field is the
    # elapsed time in milliseconds -- presumably that is what this is; confirm.
    date = _EPOCH + datetime.timedelta(seconds=float(datestring))
    return {'date': date,
            'ip': ip,
            'response': response,
            'bytescount': int(bytescount),
            'method': method,
            'uri': uri,
            'referer': referer,
            'upstreamserver': upstreamserver,
            'mimetype': mimetype}


def parseApacheLine(s):
    ''' Input is in Combined Log Format.

    Returns the dict built by sisynala.main.logline with two changes:
    'method' is added (the text of 'url' up to the first space) and 'date'
    is replaced by the datetime from apachetime2datetime.
    '''
    nearly = sisynala.main.logline(s)
    # Turns out we grep away anything not GET anyway, so 'method' never got
    # tested
    nearly['method'] = nearly['url'].split(' ', 1)[0]
    nearly['date'] = apachetime2datetime(nearly['date'])
    return nearly


def lineYielder(fn, fd):
    """Lazily yield fn(line) for every line obtained by iterating `fd`."""
    for line in fd:
        yield fn(line)


def logfixer(squidfd=None, apachefd=None):
    """Yield Apache log entries with 'ip' replaced by the matching Squid IP.

    Both arguments are iterables of log lines (open files); when omitted,
    '../logs/squid-questionmark' and '../logs/apache-get' are opened.  The
    two logs are walked in step and each Apache entry that pairs up with a
    Squid entry is yielded after its 'ip' has been overwritten.  The
    generator finishes as soon as either log runs out.

    Progress markers are written to standard output along the way.
    """
    import sys  # the file only imports sys under __main__

    if squidfd is None:
        squidfd = open('../logs/squid-questionmark')
    if apachefd is None:
        apachefd = open('../logs/apache-get')
    # Blank Squid lines parse to {}; drop them here, since an empty entry has
    # no 'date' or 'uri' to compare and cannot take part in the matching.
    squid_entries = (entry for entry in lineYielder(parseSquidLine, squidfd)
                     if entry)
    squidYielder = peekable(squid_entries)
    apacheYielder = peekable(lineYielder(parseApacheLine, apachefd))
    progress = sys.stdout.write
    progress('okay\n')
    # These should yield in time-sorted order, later after sooner
    # How to cross-reference?
    # We want to update the Apache line's IP with the squid line's IP
    try:
        # We start with a Squid line in hand
        patchy = apacheYielder.next()
        squiddie = squidYielder.next()
        while True:
            # First and foremost, if the patchy is later than the
            # *next* squiddie, we need a new squiddie.
            if patchy['date'] > squidYielder.peek()['date']:
                squiddie = squidYielder.next()
                progress('. ')
                continue
            # is the squiddie for search.creativecommons.org?
            if 'http://search.creativecommons.org' not in squiddie['uri']:
                squiddie = squidYielder.next()
                progress(', ')
                continue
            # Does this patchy match?  Compare everything up to and including
            # the '?', because that is all the Squid side is compared on.
            # NOTE(review): only 'url' is visibly set on Apache entries in
            # this file; 'uri' must come from sisynala's logline -- confirm.
            patchy_uri_bit = patchy['uri'].split('?', 1)[0]
            if '?' in patchy['uri']:
                patchy_uri_bit += '?'
            squiddie_uri_bit = squiddie['uri'][_SQUID_URI_PREFIX_LEN:]
            if squiddie_uri_bit == patchy_uri_bit:
                patchy['ip'] = squiddie['ip']
                yield patchy
                patchy = apacheYielder.next()
                progress(':-)\n')
            else:
                patchy = apacheYielder.next()
                progress('? ')
    except StopIteration:
        # One of the logs is exhausted.  Return explicitly: letting
        # StopIteration escape a generator is an error under PEP 479.
        return


def _mozilla_search_queries(l):
    """Yield (entry, value) for every q= parameter of a Mozilla-search entry.

    Entries whose 'url' lacks the Mozilla marker are skipped.  The query
    string is whatever follows the last '?'; sections without '=' and URLs
    without '?' contribute nothing.  Names and values are URL-decoded.
    """
    try:
        from urllib.parse import unquote_plus  # Python 3
    except ImportError:
        from urllib import unquote_plus  # Python 2

    for thing in l:
        url = thing['url']
        if _MOZILLA_MARKER not in url or '?' not in url:
            continue
        # FIXME: Is there a better API to all this?
        fullquery = url.rpartition('?')[2]
        for section in fullquery.split('&'):
            question, equals, answer = section.partition('=')
            if equals and unquote_plus(question) == 'q':
                yield thing, unquote_plus(answer)


def _sorted_counts(data):
    """Turn a {key: count} dict into a list of (count, key), ascending."""
    return sorted([(count, key) for key, count in data.items()])


def unique_ips_that_use_mozilla_search(l):
    """Return the set of 'ip' values of entries whose 'url' has the marker."""
    return set([thing['ip'] for thing in l if _MOZILLA_MARKER in thing['url']])


def total_searches_count(l):
    """Count the entries whose 'url' contains the Mozilla-search marker."""
    return sum([1 for thing in l if _MOZILLA_MARKER in thing['url']])


def unique_ips_that_did_not_use_mozilla_search(l):
    """Return the set of 'ip' values of entries whose 'url' lacks the marker."""
    return set([thing['ip'] for thing in l
                if _MOZILLA_MARKER not in thing['url']])


def unique_ips_that_search_more_than_once(l):
    """Return a list of IPs seen with more than one Mozilla-search q= value."""
    data = {}
    for thing, answer in _mozilla_search_queries(l):
        data[thing['ip']] = data.get(thing['ip'], 0) + 1
    return [ip for ip in data if data[ip] > 1]


def unique_ips_that_sometimes_use_mozilla_sometimes_not(i, j):
    """Return IPs that are Mozilla-search users in `i` and non-users in `j`.

    Each argument is consumed exactly once, so one-shot iterators over the
    same data may be passed.
    """
    users = unique_ips_that_use_mozilla_search(i)
    non_users = unique_ips_that_did_not_use_mozilla_search(j)
    return users & non_users


def top_search_terms(l):
    ''' Separated by whitespace.

    Counts the lower-cased runs of ASCII letters in every Mozilla-search q=
    value and returns a list of (count, term) sorted ascending.
    '''
    data = {}
    for thing, answer in _mozilla_search_queries(l):
        for term in re.split(r'[^a-zA-Z]+', answer):
            term = term.strip().lower()
            if term:
                data[term] = data.get(term, 0) + 1
    return _sorted_counts(data)


def top_searches(l):
    ''' Raw!

    Counts every whole Mozilla-search q= value (stripped and lower-cased)
    and returns a list of (count, query) sorted ascending.
    '''
    data = {}
    for thing, answer in _mozilla_search_queries(l):
        terms = answer.strip().lower()
        if terms:
            data[terms] = data.get(terms, 0) + 1
    return _sorted_counts(data)


def _main(argv):
    """Write every report to standard error.

    argv[1] is the Squid log path and argv[2] the Apache log path.  The logs
    are re-read from the start for each report.
    """
    import sys

    squidsrc, apachesrc = argv[1], argv[2]

    def fixed():
        # A fresh pass over both logs; the files are closed once the pass
        # has been consumed.
        with open(squidsrc) as squidfd:
            with open(apachesrc) as apachefd:
                for entry in logfixer(squidfd, apachefd):
                    yield entry

    def report(*parts):
        # Space-separated items followed by a newline.
        sys.stderr.write(' '.join([str(part) for part in parts]) + '\n')

    report('sourceid=Mozilla-search reports:')
    report('unique ip count is:')
    report(' ', len(unique_ips_that_use_mozilla_search(fixed())))
    report('total number of searches:')
    report(' ', total_searches_count(fixed()))
    report('unique ip count that searches >= twice:')
    report(" ", len(unique_ips_that_search_more_than_once(fixed())))
    report("All ASCII words searched for individually, sorted by hits:")
    report(format_list(top_search_terms(fixed())))
    report("all searches sorted by hits:")
    report(format_list(top_searches(fixed())))
    report('unique ip count that sometimes uses Mozilla-search sometimes not:')
    report(" ", len(unique_ips_that_sometimes_use_mozilla_sometimes_not(
        fixed(), fixed())))


if __name__ == '__main__':
    import sys
    _main(sys.argv)