#!/usr/bin/env python import sys import os import traceback import httplib import itertools import mwclient import config import views import linkback_reader import ccquery from utils import tries WIKI_HOST = 'monitor.creativecommons.org' WIKI_PATH = '/' BOT_NAME = 'CCStatsBot' BOT_PASS = 'bhyccstatsbot' DB_FILE = config.DB_FILE class WikiBot(object): def __init__(self, filter = None): site = mwclient.Site(WIKI_HOST, WIKI_PATH) self.site = site self.login() if filter is None: filter = lambda x: True self.filter = filter return def login(self): self.site.login(BOT_NAME, BOT_PASS) return def upload(self, file_name, content, comment=''): print "Uploading file: ", file_name, "...", sys.stdout.flush() if isinstance(content, unicode): content = content.encode('utf8') site = self.site try: tries( 3, lambda: site.upload(content, file_name, comment, ignore=True) ) except httplib.IncompleteRead: # The upload is success even we got an IncompleteRead pass try: uploaded = site.Images[file_name] except: # This will be failed... don't know why. pass uploaded = site.Images[file_name] print 'Done.' return uploaded def put_page(self, title, content): page = self.site.Pages[title] page.save(content) return page def get_page(self, title): page = self.site.Pages[title] return page def update_pages(self, pages): for page in pages: if not page.text: # Do nothing if the page content is empty continue if not self.filter(page): continue print "Updating page: ", page.title, "...", sys.stdout.flush() try: self.put_page(page.title, page.text) print "Done." #except mwclient.MwClientError: except: print "Error when updating page:" print '-'*60 traceback.print_exc() print '-'*60 # reset connection pool and continue to next task del self.site.connection[:] self.login() return def upload_files(self, files, seturl_callback): for file in files: if self.filter(file): uploaded = self.upload(file.title, file.text) else: uploaded = self.site.Images[file.title] url = uploaded.imageinfo[u'url'] seturl_callback(file.title, url) return def update_wiki(args = (), query=None): if query is None: query = ccquery.CCQuery(DB_FILE) # setup filter if args: if args[0]=='-x': exclude = True args = args[1:] else: exclude = False args = [a.lower() for a in args] def _filter(x): title = x.title.lower() for a in args: if a in title: # return True if '-x' not set, otherwise False return not exclude return exclude else: _filter = lambda x: True bot = WikiBot(filter=_filter) view = views.View(query) filegen = view.all_files() bot.upload_files(filegen, view.set_uploaded_url) pagegen = view.all_pages() bot.update_pages(pagegen) def new_userpages(): pagegen = view.all_userpages() for page in pagegen: wikipage = bot.get_page(page.title) if not wikipage.exists: yield page else: print "Purging ", page.title wikipage.purge() return bot.update_pages(new_userpages()) return def update_wikiuserpages(query=None): if query is None: query = ccquery.CCQuery(DB_FILE) view = views.View(query) bot = WikiBot() print "WARNING: This operation is dangerous because it will overwite all existing user content. Please input 'YES I KNOW' to continue." know = raw_input() if know != 'YES I KNOW': return pagegen = view.all_userpages() bot.update_pages(pagegen) return def update_db(filter_filename=None): data = linkback_reader.most_recent() query = ccquery.CCQuery(DB_FILE) if filter_filename is not None: juris_filter = set(line.strip().lower() for line in open(filter_filename)) data = itertools.ifilter(lambda x: x[5] in juris_filter, data) for juris in juris_filter: query.del_linkbacks(juris) else: query.del_all_linkbacks() query.add_linkbacks(data) return query def update_all(): query = update_db() update_wiki(query = query) return def _get_table_file(name): if name[-4:].lower()=='.csv': name = name[:-4] return name, name+'.csv' def upload(*files): bot = WikiBot() for file in files: content = open(file, 'rb').read() upload_name = os.path.basename(file) bot.upload(upload_name, content) return def export_db(table, file=None): if file is None: table, file = _get_table_file(table) q = ccquery.CCQuery(DB_FILE) q.export_table(table, open(file,'w')) return def import_db(table, file=None): if file is None: table, file = _get_table_file(table) q = ccquery.CCQuery(DB_FILE) q.import_table(table, open(file)) return def test(): bot = WikiBot() bot.put_page('Sandbox', 'Testing Sandbox.. by ccbot.py') TEST_XML = """ """ uploaded = bot.upload('test999.xml', TEST_XML, 'testing upload.. again and agina.') print "Info of uploaded test file:", uploaded.imageinfo print "Test OK!" return def usage(): print """ ccbot.py [command] [args...] With no command, it will fetch most recently data and update both the DB and wiki. The followling command is available: db []: fetch the most recently data and update the DB. An optional jurisdiction filter filename can be provided. The filter is a list of jurisdictions one per line. Only the jurisdictions listed in this file will be updated. If this file is not given, the entire database will be updated. wiki [-x] [...]: update the wiki pages from DB data. Only the pages which is used for robot produced contents will be updated. User produced contents will be untouched. If is present, then only pages that contain at least one of the given keywords in title will be updated. If -x option is given, pages that contain given keywords in title will be excluded from update. upload: upload files to wiki. wikiuserpages: initialize all the user content pages. Warning: This will clean all the existing user contents. import []: import DB table from CSV file. export
[]: export DB table from CSV file. test: test the connection to wiki site. """ def main(*args): if len(args)==0: update_all() elif args[0]=='db': update_db(*args[1:]) elif args[0]=='wiki': update_wiki(args[1:]) elif args[0]=='upload': upload(*args[1:]) elif args[0]=='wikiuserpages': update_wikiuserpages() elif args[0]=='import': import_db(*args[1:]) elif args[0]=='export': export_db(*args[1:]) elif args[0]=='test': test() else: usage() return if __name__=='__main__': import sys main(*sys.argv[1:])