from BeautifulSoup import BeautifulSoup import re import urllib2 HACKAGE_PKG_PREFIX = 'http://hackage.haskell.org/package/' PKGDB_PREFIX= 'https://apps.fedoraproject.org/packages/' ERROR_URL = PKGDB_PREFIX + 'error' def find_deps(pkg, knowns=set(), founds=set()): print "DEBUG, checking", pkg print "DEBUG", knowns, founds if pkg in knowns: print "package is a known dependency" return (knowns, founds) elif pkg in founds: print "package already exists" return (knowns, founds) # else check the package res = urllib2.urlopen(PKGDB_PREFIX + 'ghc-' + pkg) if res.url != ERROR_URL: return (knowns, founds.union(set([pkg]))) knowns.add(pkg) # package not found, check its deps page = urllib2.urlopen(HACKAGE_PKG_PREFIX + pkg) soup = BeautifulSoup(page) deps = [d.text for d in soup.find('th', text='Dependencies').next() if d.name == 'a'] print "skipping known deps:", set(deps).intersection(knowns) print "... and found deps:", set(deps).intersection(founds) unknowns = set(deps).difference(knowns).difference(founds) print "unknown deps for", pkg, ":", unknowns # recursively check their dependencies for u in unknowns: if u in knowns or u in founds: # check again since after each serial recursion, # p might be added to a set already print "skipping", u continue (knowns1, founds1) = find_deps(u, knowns, founds) knowns = knowns.union(knowns1) founds = founds.union(founds1) return (knowns, founds)