#!/usr/bin/python -tt
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

# take a definitive repomd.xml
# compare it to set of them retrieved from mirrors
# specifically compare the timestamp on the primary.xml on each
# output list of good mirrors for a given path.

debug = True

# TODO:
# better error handling
# push into a db?
# read from a config file for various mirrors
#  config info:
#     mirrorlist-input-file
#     mirrorlist-output-path
#     path-to-repodata
#     archlist
#     canonical repo's baseurl for this mirrorlist
#     timeout for mirror check
#     close-enough time for 'good' mirrors
#
# use geoip module to figure out country of-origin per-mirror and write
# out per-country lists as well as a global list.

import os
import sys
import re
import socket

# The code below is written to run on both Python 2.6+ and Python 3, so the
# stdlib modules that were renamed between the two are imported with a
# fallback and bound to their original (Python 2) names.
try:
    import ConfigParser
    import urlparse
except ImportError:
    import configparser as ConfigParser
    import urllib.parse as urlparse

try:
    from cElementTree import iterparse
except ImportError:
    try:
        from xml.etree.cElementTree import iterparse
    except ImportError:
        from xml.etree.ElementTree import iterparse

# GeoIP and urlgrabber are only needed to actually probe mirrors.  Guarding
# the imports keeps the parsing/config helpers importable without them;
# main() refuses to run when either one is missing.
try:
    import GeoIP
except ImportError:
    GeoIP = None

try:
    from urlgrabber.grabber import URLGrabber
    from urlgrabber.grabber import URLGrabError
except ImportError:
    URLGrabber = None

    class URLGrabError(IOError):
        """Placeholder so 'except URLGrabError' stays valid without urlgrabber."""


class YumBaseError(Exception):
    """Base class for the errors raised in this module."""

    def __init__(self, args=None):
        # Hand the message to Exception instead of assigning self.args
        # directly: assigning a string there splits it into characters,
        # and assigning None is rejected outright on Python 3.
        if args is None:
            Exception.__init__(self)
        else:
            Exception.__init__(self, args)


class RepoMDError(YumBaseError):
    """Raised when a repomd.xml is damaged or lacks a requested datatype."""

    def __init__(self, args=None):
        YumBaseError.__init__(self, args)


def ns_cleanup(qn):
    """Return the tag name 'qn' with a leading '{namespace}' removed."""
    if '}' not in qn:
        return qn
    return qn.split('}')[1]


def errorprint(stuff):
    """Write 'stuff' followed by a newline to stderr."""
    sys.stderr.write('%s\n' % (stuff,))


def check_and_make_dir(dir):
    """
    Make sure 'dir' exists as a writable directory, creating it (and any
    missing parents) if needed.  Returns True on success; otherwise prints
    the reason to stderr and returns False.
    """
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            errorprint('%s is not a dir' % dir)
            return False
        if not os.access(dir, os.W_OK):
            errorprint('%s is not writable' % dir)
            return False
        return True

    try:
        # makedirs rather than mkdir so a nested output path also works
        os.makedirs(dir)
    except OSError as e:
        errorprint('Error creating dir %s: %s' % (dir, e))
        return False
    return True


class RepoData:
    """represents anything beneath a <data> tag"""

    def __init__(self, elem):
        self.type = elem.attrib.get('type')
        self.location = (None, None)      # base, relative href
        self.checksum = (None, None)      # type, value
        self.openchecksum = (None, None)  # type, value
        self.timestamp = None
        self.parse(elem)

    def parse(self, elem):
        """Fill in the attributes from the children of the <data> element."""
        for child in elem:
            child_name = ns_cleanup(child.tag)
            if child_name == 'location':
                relative = child.attrib.get('href')
                base = child.attrib.get('base')
                self.location = (base, relative)
            elif child_name == 'checksum':
                self.checksum = (child.attrib.get('type'), child.text)
            elif child_name == 'open-checksum':
                self.openchecksum = (child.attrib.get('type'), child.text)
            elif child_name == 'timestamp':
                self.timestamp = child.text


class RepoMD:
    """represents the repomd xml file"""

    def __init__(self, repoid, srcfile):
        """takes a repoid and a filename (or an open file object)
           for the repomd.xml

           Raises RepoMDError if the XML cannot be parsed."""
        self.repoid = repoid
        self.repoData = {}

        opened_here = isinstance(srcfile, str)
        if opened_here:
            # srcfile is a filename string; binary mode lets the XML parser
            # deal with the document's own encoding declaration
            infile = open(srcfile, 'rb')
        else:
            # srcfile is a file object owned by the caller
            infile = srcfile

        try:
            try:
                for event, elem in iterparse(infile):
                    if ns_cleanup(elem.tag) == "data":
                        thisdata = RepoData(elem=elem)
                        self.repoData[thisdata.type] = thisdata
            except SyntaxError:
                raise RepoMDError("Damaged repomd.xml file")
        finally:
            # only close what we opened ourselves
            if opened_here:
                infile.close()

    def fileTypes(self):
        """return list of metadata file types available"""
        return list(self.repoData)

    def getData(self, type):
        """Return the RepoData for 'type'; raise RepoMDError if absent."""
        if type in self.repoData:
            return self.repoData[type]
        raise RepoMDError("Error: requested datatype %s not available" % type)

    def dump(self):
        """dump fun output"""
        for ft in self.fileTypes():
            thisdata = self.repoData[ft]
            print('datatype: %s' % thisdata.type)
            print('location: %s %s' % thisdata.location)
            print('timestamp: %s' % thisdata.timestamp)
            print('checksum: %s -%s' % thisdata.checksum)
            print('open checksum: %s - %s' % thisdata.openchecksum)


class MirrorContainer(object):
    """One mirror url, its primary timestamp per arch and its country code.

    Constructing an instance immediately fetches repomd.xml for every arch
    through 'grabber' and looks the mirror's host up through 'gi'.
    """

    def __init__(self, url, grabber, archlist, gi):
        self.url = url
        self.grabber = grabber
        self.geoip = gi
        self.timestamps = {}    # arch -> timestamp text of the primary data
        self.archlist = archlist
        self.country = None
        self.get_timestamp(url)
        self.get_country(url)

    def get_timestamp(self, url):
        """Record the 'primary' timestamp for each arch in self.timestamps.

        Arches whose repomd.xml cannot be fetched, cannot be parsed or has
        no 'primary' entry are reported on stderr and left out.
        """
        template = '%s/repodata/repomd.xml' % url
        # fold every spelling of the arch placeholder into $basearch
        template = template.replace('$ARCH', '$BASEARCH')
        template = template.replace('$BASEARCH', '$basearch')

        for arch in self.archlist:
            finurl = template.replace('$basearch', arch)
            try:
                fo = self.grabber.urlopen(finurl)
            except URLGrabError as e:
                errorprint('error on %s: %s' % (finurl, e))
                continue

            try:
                try:
                    p = RepoMD('fooid', fo)
                finally:
                    # release the connection whether or not parsing worked
                    fo.close()
            except RepoMDError as e:
                errorprint('error on %s: %s' % (finurl, e))
                continue

            primary = p.repoData.get('primary')
            if primary is None:
                errorprint('error on %s: no primary data listed' % finurl)
                continue
            self.timestamps[arch] = primary.timestamp

    def get_country(self, url):
        """Set self.country from the address the url's host resolves to.

        self.country is left untouched (None after __init__) when the url
        has no host or the host cannot be resolved.
        """
        # .hostname, unlike the raw netloc, carries no port or credentials
        host = urlparse.urlparse(url).hostname
        if not host:
            return
        try:
            addr = socket.gethostbyname(host)
        except socket.error as e:
            # one unresolvable mirror must not abort the whole run
            errorprint('cannot resolve %s: %s' % (host, e))
            return
        self.country = self.geoip.country_code_by_addr(addr)


class MirrorListInfo(object):
    """Settings of one config section plus the mirror urls read for it."""

    def __init__(self):
        self.archlist = ['i386', 'x86_64', 'ppc']
        self.mirrorid = None
        self.inputfile = None
        self.outputpath = None
        self.timeout = 10
        self.canonical = None
        self.mirrorlist = []

    def populate_mirrorlist(self):
        """Append the urls listed in self.inputfile (one per line) to
        self.mirrorlist.  Blank lines are skipped; an unreadable file is
        reported on stderr and leaves the list unchanged."""
        try:
            fo = open(self.inputfile, 'r')
        except IOError as e:
            errorprint('cannot read mirror list %s: %s' % (self.inputfile, e))
            return

        with fo:
            for line in fo:
                url = line.strip()
                if url:
                    self.mirrorlist.append(url)


def config(cfg):
    """Parse the config file 'cfg' into a list of MirrorListInfo objects,
    one per section.

    Every section needs 'inputfile', 'outputpath' and 'canonical';
    'timeout' and 'archlist' (comma and/or whitespace separated) are
    optional.  A section missing a required option terminates the program
    with exit status 1.
    """
    # option name -> wording used in the error message
    required = (('inputfile', 'inputfile'),
                ('outputpath', 'outputpath'),
                ('canonical', 'canonical url'))

    sections = []
    conf = ConfigParser.ConfigParser()
    if not conf.read(cfg):
        # read() skips files it cannot open without raising
        errorprint('could not read config file %s' % cfg)

    for section in conf.sections():
        item = MirrorListInfo()
        item.mirrorid = '%s' % section

        broken = False
        for option, label in required:
            if conf.has_option(section, option):
                setattr(item, option, conf.get(section, option))
            else:
                errorprint('missing %s' % label)
                broken = True

        if broken:
            errorprint("Broooooooooooooken config, in section %s, bailing" % section)
            sys.exit(1)

        if conf.has_option(section, 'timeout'):
            item.timeout = conf.getint(section, 'timeout')

        if conf.has_option(section, 'archlist'):
            a_string = conf.get(section, 'archlist')
            # split() already treats newlines as whitespace
            item.archlist = a_string.replace(',', ' ').split()

        sections.append(item)

    return sections


def _write_lines(path, lines):
    """(Re)write the file 'path' with one entry of 'lines' per line."""
    with open(path, 'w') as fo:
        fo.writelines('%s\n' % line for line in lines)


def _write_mirror_lists(section, canon, mirrors):
    """Write the global and per-country lists of up-to-date mirrors.

    output is:
      section.outputpath/section.mirrorid-global-$arch.txt
      section.outputpath/section.mirrorid-$country-$arch.txt
    """
    for arch in section.archlist:
        wanted = canon.timestamps[arch]
        good = [m for m in mirrors
                if arch in m.timestamps and m.timestamps[arch] == wanted]

        glob = '%s/%s-global-%s.txt' % (section.outputpath,
                                        section.mirrorid, arch)
        _write_lines(glob, [m.url for m in good])

        # Group first and write each country file once, so the files are
        # regenerated on every run rather than appended to indefinitely.
        by_country = {}
        for m in good:
            if m.country:
                by_country.setdefault(m.country, []).append(m.url)
        for country_code, urls in by_country.items():
            country = '%s/%s-%s-%s.txt' % (section.outputpath,
                                           section.mirrorid,
                                           country_code, arch)
            _write_lines(country, urls)


def main(cfg_file):
    """Check every mirror list described in 'cfg_file' against its
    canonical repository and write the lists of mirrors whose primary
    timestamp matches the canonical one."""
    if GeoIP is None or URLGrabber is None:
        errorprint('the GeoIP and urlgrabber python modules are required')
        sys.exit(1)

    sections = config(cfg_file)
    gi = GeoIP.new(GeoIP.GEOIP_STANDARD)

    for s in sections:
        s.populate_mirrorlist()
        if len(s.mirrorlist) < 1:
            errorprint("no mirrors to look at for %s, something is broken, skipping" % s.mirrorid)
            continue

        if not check_and_make_dir(s.outputpath):
            errorprint('Error creating output path %s for %s' % (s.outputpath, s.mirrorid))
            continue

        # grab the canonical mirror's info
        ug = URLGrabber(timeout=s.timeout)
        canon = MirrorContainer(s.canonical, ug, s.archlist, gi)
        if len(canon.timestamps) < len(s.archlist):
            # if we can't get info for all arches for the canonical mirror, skip
            errorprint("Cannot contact canonical host for all archs, skipping")
            continue

        if debug:
            # debug only - just printing out info
            for arch in s.archlist:
                if arch in canon.timestamps:
                    print(' %s: %s' % (arch, canon.timestamps[arch]))

        # get the info for all the mirrors; the list is rebuilt for every
        # section so one section's mirrors never end up in another's output
        mirrors = [MirrorContainer(url, ug, s.archlist, gi)
                   for url in s.mirrorlist]

        _write_mirror_lists(s, canon, mirrors)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        errorprint('usage: %s <config-file>' % sys.argv[0])
        sys.exit(1)
    main(sys.argv[1])