[python-urlgrabber/f18] Update to latest HEAD.

Zdeněk Pavlas zpavlas at fedoraproject.org
Thu Dec 6 11:12:32 UTC 2012


commit 68399901bcd148be71c533cb5e2e6416be0cad7d
Author: Zdeněk Pavlas <zpavlas at redhat.com>
Date:   Thu Dec 6 12:12:23 2012 +0100

    Update to latest HEAD.

 python-urlgrabber.spec |    6 ++-
 urlgrabber-HEAD.patch  |  103 +++++++++++++++++++++++++++++++-----------------
 2 files changed, 72 insertions(+), 37 deletions(-)
---
diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec
index 8a6f37e..6ee5dd6 100644
--- a/python-urlgrabber.spec
+++ b/python-urlgrabber.spec
@@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.1
-Release: 22%{?dist}
+Release: 23%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch1: urlgrabber-HEAD.patch
 
@@ -44,6 +44,10 @@ rm -rf $RPM_BUILD_ROOT
 %attr(0755,root,root) %{_libexecdir}/urlgrabber-ext-down
 
 %changelog
+* Thu Dec  6 2012 Zdeněk Pavlas <zpavlas at redhat.com> - 3.9.1-23
+- Update to latest HEAD.
+- Improve URLGRABBER_DEBUG, add max_connections.  BZ 853432
+
 * Thu Nov  1 2012 Zdeněk Pavlas <zpavlas at redhat.com> - 3.9.1-22
 - Update to latest HEAD
 - Get rid of "HTTP 200 OK" errors.  BZ 871835.
diff --git a/urlgrabber-HEAD.patch b/urlgrabber-HEAD.patch
index 55c3ba4..bc028de 100644
--- a/urlgrabber-HEAD.patch
+++ b/urlgrabber-HEAD.patch
@@ -236,7 +236,7 @@ index 3e5f3b7..8eeaeda 100644
      return (fb,lb)
  
 diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index e090e90..74a692c 100644
+index e090e90..78c2e59 100644
 --- a/urlgrabber/grabber.py
 +++ b/urlgrabber/grabber.py
 @@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
@@ -458,7 +458,26 @@ index e090e90..74a692c 100644
  ########################################################################
  # functions for debugging output.  These functions are here because they
  # are also part of the module initialization.
-@@ -527,6 +608,29 @@ def _(st):
+@@ -504,6 +585,7 @@ def _init_default_logger(logspec=None):
+         else:  handler = logging.FileHandler(filename)
+         handler.setFormatter(formatter)
+         DBOBJ = logging.getLogger('urlgrabber')
++        DBOBJ.propagate = False
+         DBOBJ.addHandler(handler)
+         DBOBJ.setLevel(level)
+     except (KeyError, ImportError, ValueError):
+@@ -512,8 +594,8 @@ def _init_default_logger(logspec=None):
+ 
+ def _log_package_state():
+     if not DEBUG: return
+-    DEBUG.info('urlgrabber version  = %s' % __version__)
+-    DEBUG.info('trans function "_"  = %s' % _)
++    DEBUG.debug('urlgrabber version  = %s' % __version__)
++    DEBUG.debug('trans function "_"  = %s' % _)
+         
+ _init_default_logger()
+ _log_package_state()
+@@ -527,6 +609,29 @@ def _(st):
  #                 END MODULE INITIALIZATION
  ########################################################################
  
@@ -488,7 +507,7 @@ index e090e90..74a692c 100644
  
  
  class URLGrabError(IOError):
-@@ -662,6 +766,7 @@ class URLParser:
+@@ -662,6 +767,7 @@ class URLParser:
            opts.quote = 0     --> do not quote it
            opts.quote = None  --> guess
          """
@@ -496,7 +515,7 @@ index e090e90..74a692c 100644
          quote = opts.quote
          
          if opts.prefix:
-@@ -768,6 +873,41 @@ class URLGrabberOptions:
+@@ -768,6 +874,41 @@ class URLGrabberOptions:
          else: # throttle is a float
              return self.bandwidth * self.throttle
          
@@ -538,7 +557,7 @@ index e090e90..74a692c 100644
      def derive(self, **kwargs):
          """Create a derived URLGrabberOptions instance.
          This method creates a new instance and overrides the
-@@ -791,30 +931,37 @@ class URLGrabberOptions:
+@@ -791,30 +932,37 @@ class URLGrabberOptions:
          provided here.
          """
          self.progress_obj = None
@@ -577,7 +596,7 @@ index e090e90..74a692c 100644
          self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
          self.ssl_context = None # no-op in pycurl
          self.ssl_verify_peer = True # check peer's cert for authenticityb
-@@ -827,6 +974,12 @@ class URLGrabberOptions:
+@@ -827,6 +975,12 @@ class URLGrabberOptions:
          self.size = None # if we know how big the thing we're getting is going
                           # to be. this is ultimately a MAXIMUM size for the file
          self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
@@ -590,7 +609,7 @@ index e090e90..74a692c 100644
          
      def __repr__(self):
          return self.format()
-@@ -846,7 +999,18 @@ class URLGrabberOptions:
+@@ -846,7 +1000,18 @@ class URLGrabberOptions:
          s = s + indent + '}'
          return s
  
@@ -610,7 +629,7 @@ index e090e90..74a692c 100644
      """Provides easy opening of URLs with a variety of options.
      
      All options are specified as kwargs. Options may be specified when
-@@ -872,7 +1036,6 @@ class URLGrabber:
+@@ -872,7 +1037,6 @@ class URLGrabber:
              # beware of infinite loops :)
              tries = tries + 1
              exception = None
@@ -618,7 +637,7 @@ index e090e90..74a692c 100644
              callback  = None
              if DEBUG: DEBUG.info('attempt %i/%s: %s',
                                   tries, opts.retry, args[0])
-@@ -883,54 +1046,62 @@ class URLGrabber:
+@@ -883,54 +1047,62 @@ class URLGrabber:
              except URLGrabError, e:
                  exception = e
                  callback = opts.failure_callback
@@ -688,7 +707,7 @@ index e090e90..74a692c 100644
          if scheme == 'file' and not opts.copy_local:
              # just return the name of the local file - don't make a 
              # copy currently
-@@ -950,41 +1121,51 @@ class URLGrabber:
+@@ -950,41 +1122,51 @@ class URLGrabber:
  
              elif not opts.range:
                  if not opts.checkfunc is None:
@@ -755,7 +774,7 @@ index e090e90..74a692c 100644
          if limit is not None:
              limit = limit + 1
              
-@@ -1000,12 +1181,8 @@ class URLGrabber:
+@@ -1000,12 +1182,8 @@ class URLGrabber:
                  else: s = fo.read(limit)
  
                  if not opts.checkfunc is None:
@@ -770,7 +789,7 @@ index e090e90..74a692c 100644
              finally:
                  fo.close()
              return s
-@@ -1020,6 +1197,7 @@ class URLGrabber:
+@@ -1020,6 +1198,7 @@ class URLGrabber:
          return s
          
      def _make_callback(self, callback_obj):
@@ -778,7 +797,7 @@ index e090e90..74a692c 100644
          if callable(callback_obj):
              return callback_obj, (), {}
          else:
-@@ -1030,7 +1208,7 @@ class URLGrabber:
+@@ -1030,7 +1209,7 @@ class URLGrabber:
  default_grabber = URLGrabber()
  
  
@@ -787,7 +806,7 @@ index e090e90..74a692c 100644
      def __init__(self, url, filename, opts):
          self.fo = None
          self._hdr_dump = ''
-@@ -1052,10 +1230,13 @@ class PyCurlFileObject():
+@@ -1052,10 +1231,13 @@ class PyCurlFileObject():
          self._reget_length = 0
          self._prog_running = False
          self._error = (None, None)
@@ -803,7 +822,7 @@ index e090e90..74a692c 100644
      def __getattr__(self, name):
          """This effectively allows us to wrap at the instance level.
          Any attribute not found in _this_ object will be searched for
-@@ -1067,6 +1248,12 @@ class PyCurlFileObject():
+@@ -1067,6 +1249,12 @@ class PyCurlFileObject():
  
      def _retrieve(self, buf):
          try:
@@ -816,7 +835,7 @@ index e090e90..74a692c 100644
              if not self._prog_running:
                  if self.opts.progress_obj:
                      size  = self.size + self._reget_length
-@@ -1079,15 +1266,24 @@ class PyCurlFileObject():
+@@ -1079,15 +1267,24 @@ class PyCurlFileObject():
                      self.opts.progress_obj.update(self._amount_read)
  
              self._amount_read += len(buf)
@@ -843,7 +862,7 @@ index e090e90..74a692c 100644
          try:
              self._hdr_dump += buf
              # we have to get the size before we do the progress obj start
-@@ -1104,7 +1300,17 @@ class PyCurlFileObject():
+@@ -1104,7 +1301,17 @@ class PyCurlFileObject():
                      s = parse150(buf)
                  if s:
                      self.size = int(s)
@@ -857,12 +876,12 @@ index e090e90..74a692c 100644
 +                
 +            if len(self._hdr_dump) != 0 and buf == '\r\n':
 +                self._hdr_ended = True
-+                if DEBUG: DEBUG.info('header ended:')
++                if DEBUG: DEBUG.debug('header ended:')
 +                
              return len(buf)
          except KeyboardInterrupt:
              return pycurl.READFUNC_ABORT
-@@ -1113,8 +1319,10 @@ class PyCurlFileObject():
+@@ -1113,8 +1320,10 @@ class PyCurlFileObject():
          if self._parsed_hdr:
              return self._parsed_hdr
          statusend = self._hdr_dump.find('\n')
@@ -873,7 +892,7 @@ index e090e90..74a692c 100644
          self._parsed_hdr =  mimetools.Message(hdrfp)
          return self._parsed_hdr
      
-@@ -1127,6 +1335,9 @@ class PyCurlFileObject():
+@@ -1127,6 +1336,9 @@ class PyCurlFileObject():
          if not opts:
              opts = self.opts
  
@@ -883,13 +902,14 @@ index e090e90..74a692c 100644
  
          # defaults we're always going to set
          self.curl_obj.setopt(pycurl.NOPROGRESS, False)
-@@ -1136,11 +1347,21 @@ class PyCurlFileObject():
+@@ -1136,11 +1348,21 @@ class PyCurlFileObject():
          self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
          self.curl_obj.setopt(pycurl.FAILONERROR, True)
          self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
 +        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
          
-         if DEBUG:
+-        if DEBUG:
++        if DEBUG and DEBUG.level <= 10:
              self.curl_obj.setopt(pycurl.VERBOSE, True)
          if opts.user_agent:
              self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
@@ -905,7 +925,7 @@ index e090e90..74a692c 100644
          
          # maybe to be options later
          self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
-@@ -1148,9 +1369,11 @@ class PyCurlFileObject():
+@@ -1148,9 +1370,11 @@ class PyCurlFileObject():
          
          # timeouts
          timeout = 300
@@ -920,7 +940,7 @@ index e090e90..74a692c 100644
  
          # ssl options
          if self.scheme == 'https':
-@@ -1158,13 +1381,16 @@ class PyCurlFileObject():
+@@ -1158,13 +1382,16 @@ class PyCurlFileObject():
                  self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
                  self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
              self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
@@ -938,7 +958,7 @@ index e090e90..74a692c 100644
              if opts.ssl_cert_type:                
                  self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
              if opts.ssl_key_pass:
-@@ -1187,28 +1413,26 @@ class PyCurlFileObject():
+@@ -1187,28 +1414,26 @@ class PyCurlFileObject():
          if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
              self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
              
@@ -983,7 +1003,7 @@ index e090e90..74a692c 100644
              
          # our url
          self.curl_obj.setopt(pycurl.URL, self.url)
-@@ -1228,39 +1452,36 @@ class PyCurlFileObject():
+@@ -1228,39 +1453,36 @@ class PyCurlFileObject():
              
              code = self.http_code
              errcode = e.args[0]
@@ -1032,7 +1052,7 @@ index e090e90..74a692c 100644
                  # this is probably wrong but ultimately this is what happens
                  # we have a legit http code and a pycurl 'writer failed' code
                  # which almost always means something aborted it from outside
-@@ -1272,33 +1493,94 @@ class PyCurlFileObject():
+@@ -1272,33 +1494,94 @@ class PyCurlFileObject():
              elif errcode == 58:
                  msg = _("problem with the local client certificate")
                  err = URLGrabError(14, msg)
@@ -1135,7 +1155,7 @@ index e090e90..74a692c 100644
  
      def _do_open(self):
          self.curl_obj = _curl_cache
-@@ -1333,7 +1615,11 @@ class PyCurlFileObject():
+@@ -1333,7 +1616,11 @@ class PyCurlFileObject():
                  
          if self.opts.range:
              rt = self.opts.range
@@ -1148,7 +1168,7 @@ index e090e90..74a692c 100644
  
          if rt:
              header = range_tuple_to_header(rt)
-@@ -1434,21 +1720,46 @@ class PyCurlFileObject():
+@@ -1434,21 +1721,46 @@ class PyCurlFileObject():
              #fh, self._temp_name = mkstemp()
              #self.fo = open(self._temp_name, 'wb')
  
@@ -1202,7 +1222,7 @@ index e090e90..74a692c 100644
          else:
              #self.fo = open(self._temp_name, 'r')
              self.fo.seek(0)
-@@ -1526,17 +1837,20 @@ class PyCurlFileObject():
+@@ -1526,17 +1838,20 @@ class PyCurlFileObject():
              if self._prog_running:
                  downloaded += self._reget_length
                  self.opts.progress_obj.update(downloaded)
@@ -1228,7 +1248,7 @@ index e090e90..74a692c 100644
  
              msg = _("Downloaded more than max size for %s: %s > %s") \
                          % (self.url, cur, max_size)
-@@ -1544,13 +1858,6 @@ class PyCurlFileObject():
+@@ -1544,13 +1859,6 @@ class PyCurlFileObject():
              return True
          return False
          
@@ -1242,7 +1262,7 @@ index e090e90..74a692c 100644
      def read(self, amt=None):
          self._fill_buffer(amt)
          if amt is None:
-@@ -1582,9 +1889,21 @@ class PyCurlFileObject():
+@@ -1582,9 +1890,21 @@ class PyCurlFileObject():
              self.opts.progress_obj.end(self._amount_read)
          self.fo.close()
          
@@ -1265,7 +1285,7 @@ index e090e90..74a692c 100644
  
  #####################################################################
  # DEPRECATED FUNCTIONS
-@@ -1621,6 +1940,467 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
+@@ -1621,6 +1941,478 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
  
          
  #####################################################################
@@ -1498,17 +1518,23 @@ index e090e90..74a692c 100644
 +    host_con = {} # current host connection counts
 +
 +    def start(opts, tries):
++        opts.tries = tries
++        try:
++            dl.start(opts)
++        except OSError, e:
++            # can't spawn downloader, give up immediately
++            opts.exception = URLGrabError(5, exception2msg(e))
++            _run_callback(opts.failfunc, opts)
++            return
++
 +        key, limit = opts.async
 +        host_con[key] = host_con.get(key, 0) + 1
-+        opts.tries = tries
 +        if opts.progress_obj:
 +            if opts.multi_progress_obj:
 +                opts._progress = opts.multi_progress_obj.newMeter()
 +                opts._progress.start(text=opts.text)
 +            else:
 +                opts._progress = time.time() # no updates
-+        if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
-+        dl.start(opts)
 +
 +    def perform():
 +        for opts, size, ug_err in dl.perform():
@@ -1588,6 +1614,8 @@ index e090e90..74a692c 100644
 +            # check global limit
 +            while len(dl.running) >= default_grabber.opts.max_connections:
 +                perform()
++            if DEBUG:
++                DEBUG.info('max_connections: %d/%d', len(dl.running), default_grabber.opts.max_connections)
 +
 +            if opts.mirror_group:
 +                mg, errors, failed, removed = opts.mirror_group
@@ -1636,6 +1664,9 @@ index e090e90..74a692c 100644
 +            key, limit = opts.async
 +            while host_con.get(key, 0) >= limit:
 +                perform()
++            if DEBUG:
++                DEBUG.info('max_connections(%s): %d/%d', key, host_con.get(key, 0), limit)
++
 +            start(opts, 1)
 +    except IOError, e:
 +        if e.errno != 4: raise

