[python-urlgrabber] Update to latest HEAD

Zdeněk Pavlas zpavlas at fedoraproject.org
Thu Nov 1 13:13:52 UTC 2012


commit add16b8996c1e55801346bc6b216a9c3fc8337c3
Author: Zdeněk Pavlas <zpavlas at redhat.com>
Date:   Thu Nov 1 14:13:37 2012 +0100

    Update to latest HEAD

 python-urlgrabber.spec |    6 ++-
 urlgrabber-HEAD.patch  |  151 ++++++++++++++++++++++++++++++-----------------
 2 files changed, 101 insertions(+), 56 deletions(-)
---
diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec
index 2975737..d046d31 100644
--- a/python-urlgrabber.spec
+++ b/python-urlgrabber.spec
@@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.1
-Release: 20%{?dist}
+Release: 21%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch1: urlgrabber-HEAD.patch
 
@@ -44,6 +44,10 @@ rm -rf $RPM_BUILD_ROOT
 %attr(0755,root,root) %{_libexecdir}/urlgrabber-ext-down
 
 %changelog
+* Thu Nov  1 2012 Zdeněk Pavlas <zpavlas at redhat.com> - 3.9.1-21
+- Update to latest HEAD.
+- Get rid of "HTTP 200 OK" errors.  BZ 871835.
+
 * Tue Sep  4 2012 Zdeněk Pavlas <zpavlas at redhat.com> - 3.9.1-20
 - Update to latest HEAD.
 - Fixed BZ 851178, 854075.
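
The "HTTP 200 OK" fix (BZ 871835) is the hunk below that rewrites the
pycurl.error handler in PyCurlFileObject: an empty libcurl message used
to be turned into an HTTP error whenever http_code was non-zero, so an
aborted transfer on a perfectly good 200 response surfaced as "HTTP
Error 200 : OK".  The new test only fabricates an HTTP error for a
genuine non-2xx status, and a write-callback failure on a 2xx response
re-raises the stored callback error instead.  A minimal sketch of the
new decision, with hypothetical names (the real logic lives in
PyCurlFileObject's pycurl.error handler further down in this patch):

    from urlgrabber.grabber import URLGrabError

    def classify(errcode, code, curlmsg, cb_error=None):
        if errcode == 23 and 200 <= code <= 299:
            # writer failed on a 2xx response: surface the stored
            # callback error rather than inventing an HTTP error
            raise cb_error or KeyboardInterrupt
        if curlmsg == '' and code and not 200 <= code <= 299:
            # only a genuine non-2xx status becomes an HTTP error
            raise URLGrabError(14, 'HTTP Error %d' % code)
        raise URLGrabError(14, 'curl error %d: %s' % (errcode, curlmsg))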
diff --git a/urlgrabber-HEAD.patch b/urlgrabber-HEAD.patch
index ef304ad..55c3ba4 100644
--- a/urlgrabber-HEAD.patch
+++ b/urlgrabber-HEAD.patch
@@ -236,7 +236,7 @@ index 3e5f3b7..8eeaeda 100644
      return (fb,lb)
  
 diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index e090e90..01218b0 100644
+index e090e90..74a692c 100644
 --- a/urlgrabber/grabber.py
 +++ b/urlgrabber/grabber.py
 @@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
@@ -458,7 +458,7 @@ index e090e90..01218b0 100644
  ########################################################################
  # functions for debugging output.  These functions are here because they
  # are also part of the module initialization.
-@@ -527,6 +608,22 @@ def _(st):
+@@ -527,6 +608,29 @@ def _(st):
  #                 END MODULE INITIALIZATION
  ########################################################################
  
@@ -475,13 +475,20 @@ index e090e90..01218b0 100644
 +        obj = obj.encode('utf-8', errors)
 +    return obj
 +
++def exception2msg(e):
++    try:
++        return str(e)
++    except UnicodeEncodeError:
++        # always use byte strings
++        return unicode(e).encode('utf8')
++
 +########################################################################
 +#                 END UTILITY FUNCTIONS
 +########################################################################
  
  
  class URLGrabError(IOError):
-@@ -662,6 +759,7 @@ class URLParser:
+@@ -662,6 +766,7 @@ class URLParser:
            opts.quote = 0     --> do not quote it
            opts.quote = None  --> guess
          """
@@ -489,7 +496,7 @@ index e090e90..01218b0 100644
          quote = opts.quote
          
          if opts.prefix:
-@@ -768,6 +866,41 @@ class URLGrabberOptions:
+@@ -768,6 +873,41 @@ class URLGrabberOptions:
          else: # throttle is a float
              return self.bandwidth * self.throttle
          
@@ -531,7 +538,7 @@ index e090e90..01218b0 100644
      def derive(self, **kwargs):
          """Create a derived URLGrabberOptions instance.
          This method creates a new instance and overrides the
-@@ -791,30 +924,37 @@ class URLGrabberOptions:
+@@ -791,30 +931,37 @@ class URLGrabberOptions:
          provided here.
          """
          self.progress_obj = None
@@ -570,7 +577,7 @@ index e090e90..01218b0 100644
          self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
          self.ssl_context = None # no-op in pycurl
          self.ssl_verify_peer = True # check peer's cert for authenticityb
-@@ -827,6 +967,12 @@ class URLGrabberOptions:
+@@ -827,6 +974,12 @@ class URLGrabberOptions:
          self.size = None # if we know how big the thing we're getting is going
                           # to be. this is ultimately a MAXIMUM size for the file
          self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
@@ -583,7 +590,7 @@ index e090e90..01218b0 100644
          
      def __repr__(self):
          return self.format()
-@@ -846,7 +992,18 @@ class URLGrabberOptions:
+@@ -846,7 +999,18 @@ class URLGrabberOptions:
          s = s + indent + '}'
          return s
  
@@ -603,7 +610,7 @@ index e090e90..01218b0 100644
      """Provides easy opening of URLs with a variety of options.
      
      All options are specified as kwargs. Options may be specified when
-@@ -872,7 +1029,6 @@ class URLGrabber:
+@@ -872,7 +1036,6 @@ class URLGrabber:
              # beware of infinite loops :)
              tries = tries + 1
              exception = None
@@ -611,7 +618,7 @@ index e090e90..01218b0 100644
              callback  = None
              if DEBUG: DEBUG.info('attempt %i/%s: %s',
                                   tries, opts.retry, args[0])
-@@ -883,54 +1039,62 @@ class URLGrabber:
+@@ -883,54 +1046,62 @@ class URLGrabber:
              except URLGrabError, e:
                  exception = e
                  callback = opts.failure_callback
@@ -681,7 +688,7 @@ index e090e90..01218b0 100644
          if scheme == 'file' and not opts.copy_local:
              # just return the name of the local file - don't make a 
              # copy currently
-@@ -950,41 +1114,51 @@ class URLGrabber:
+@@ -950,41 +1121,51 @@ class URLGrabber:
  
              elif not opts.range:
                  if not opts.checkfunc is None:
@@ -748,7 +755,7 @@ index e090e90..01218b0 100644
          if limit is not None:
              limit = limit + 1
              
-@@ -1000,12 +1174,8 @@ class URLGrabber:
+@@ -1000,12 +1181,8 @@ class URLGrabber:
                  else: s = fo.read(limit)
  
                  if not opts.checkfunc is None:
@@ -763,7 +770,7 @@ index e090e90..01218b0 100644
              finally:
                  fo.close()
              return s
-@@ -1020,6 +1190,7 @@ class URLGrabber:
+@@ -1020,6 +1197,7 @@ class URLGrabber:
          return s
          
      def _make_callback(self, callback_obj):
@@ -771,7 +778,7 @@ index e090e90..01218b0 100644
          if callable(callback_obj):
              return callback_obj, (), {}
          else:
-@@ -1030,7 +1201,7 @@ class URLGrabber:
+@@ -1030,7 +1208,7 @@ class URLGrabber:
  default_grabber = URLGrabber()
  
  
@@ -780,7 +787,7 @@ index e090e90..01218b0 100644
      def __init__(self, url, filename, opts):
          self.fo = None
          self._hdr_dump = ''
-@@ -1052,10 +1223,13 @@ class PyCurlFileObject():
+@@ -1052,10 +1230,13 @@ class PyCurlFileObject():
          self._reget_length = 0
          self._prog_running = False
          self._error = (None, None)
@@ -796,7 +803,7 @@ index e090e90..01218b0 100644
      def __getattr__(self, name):
          """This effectively allows us to wrap at the instance level.
          Any attribute not found in _this_ object will be searched for
-@@ -1067,6 +1241,12 @@ class PyCurlFileObject():
+@@ -1067,6 +1248,12 @@ class PyCurlFileObject():
  
      def _retrieve(self, buf):
          try:
@@ -809,7 +816,18 @@ index e090e90..01218b0 100644
              if not self._prog_running:
                  if self.opts.progress_obj:
                      size  = self.size + self._reget_length
-@@ -1085,9 +1265,14 @@ class PyCurlFileObject():
+@@ -1079,15 +1266,24 @@ class PyCurlFileObject():
+                     self.opts.progress_obj.update(self._amount_read)
+ 
+             self._amount_read += len(buf)
+-            self.fo.write(buf)
++            try:
++                self.fo.write(buf)
++            except IOError, e:
++                self._cb_error = URLGrabError(16, exception2msg(e))
++                return -1
+             return len(buf)
+         except KeyboardInterrupt:
              return -1
              
      def _hdr_retrieve(self, buf):
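
The hunk above sets up a small protocol between _retrieve() and the
error handler: pycurl treats any WRITEFUNCTION return value other than
len(buf) as CURLE_WRITE_ERROR (23) and aborts the transfer, so the real
cause is parked in self._cb_error and re-raised later (see the
"raise getattr(self, '_cb_error', KeyboardInterrupt)" line in a later
hunk) instead of being misread as a ctrl-c.  The same pattern in
isolation, with hypothetical names:

    import pycurl

    class Writer(object):
        def __init__(self, fo):
            self.fo, self.cb_error = fo, None
        def write(self, buf):
            try:
                self.fo.write(buf)
                return len(buf)
            except IOError, e:
                self.cb_error = e   # remember the real cause
                return -1           # != len(buf) -> pycurl error 23

    # after curl.perform() fails with error 23, the caller checks
    # writer.cb_error before assuming a KeyboardInterrupt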
@@ -825,7 +843,7 @@ index e090e90..01218b0 100644
          try:
              self._hdr_dump += buf
              # we have to get the size before we do the progress obj start
-@@ -1104,7 +1289,17 @@ class PyCurlFileObject():
+@@ -1104,7 +1300,17 @@ class PyCurlFileObject():
                      s = parse150(buf)
                  if s:
                      self.size = int(s)
@@ -844,7 +862,7 @@ index e090e90..01218b0 100644
              return len(buf)
          except KeyboardInterrupt:
              return pycurl.READFUNC_ABORT
-@@ -1113,8 +1308,10 @@ class PyCurlFileObject():
+@@ -1113,8 +1319,10 @@ class PyCurlFileObject():
          if self._parsed_hdr:
              return self._parsed_hdr
          statusend = self._hdr_dump.find('\n')
@@ -855,7 +873,7 @@ index e090e90..01218b0 100644
          self._parsed_hdr =  mimetools.Message(hdrfp)
          return self._parsed_hdr
      
-@@ -1127,6 +1324,9 @@ class PyCurlFileObject():
+@@ -1127,6 +1335,9 @@ class PyCurlFileObject():
          if not opts:
              opts = self.opts
  
@@ -865,7 +883,7 @@ index e090e90..01218b0 100644
  
          # defaults we're always going to set
          self.curl_obj.setopt(pycurl.NOPROGRESS, False)
-@@ -1136,11 +1336,21 @@ class PyCurlFileObject():
+@@ -1136,11 +1347,21 @@ class PyCurlFileObject():
          self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
          self.curl_obj.setopt(pycurl.FAILONERROR, True)
          self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
@@ -887,7 +905,7 @@ index e090e90..01218b0 100644
          
          # maybe to be options later
          self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
-@@ -1148,9 +1358,11 @@ class PyCurlFileObject():
+@@ -1148,9 +1369,11 @@ class PyCurlFileObject():
          
          # timeouts
          timeout = 300
@@ -897,12 +915,12 @@ index e090e90..01218b0 100644
 +        if hasattr(opts, 'timeout'):
 +            timeout = int(opts.timeout or 0)
 +        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
-+        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
++        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1000)
 +        self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
  
          # ssl options
          if self.scheme == 'https':
-@@ -1158,13 +1370,16 @@ class PyCurlFileObject():
+@@ -1158,13 +1381,16 @@ class PyCurlFileObject():
                  self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
                  self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
              self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
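
On the timeout hunk above: libcurl's LOW_SPEED_* pair aborts a transfer
with error 28 (surfaced as URLGrabError(12, 'Timeout ...')) once the
average rate stays below LOW_SPEED_LIMIT bytes/sec for LOW_SPEED_TIME
seconds, while CONNECTTIMEOUT caps only the connect phase.  Raising the
limit from 1 to 1000 bytes/sec makes near-stalled mirrors fail within
opts.timeout instead of dribbling along for hours.  The effect in
isolation (the literal values are assumptions; urlgrabber derives them
from opts.timeout):

    import pycurl

    c = pycurl.Curl()
    c.setopt(pycurl.URL, 'http://example.com/big.iso')
    c.setopt(pycurl.CONNECTTIMEOUT, 300)    # connect phase only
    c.setopt(pycurl.LOW_SPEED_LIMIT, 1000)  # bytes per second ...
    c.setopt(pycurl.LOW_SPEED_TIME, 300)    # ... sustained this long
    # a transfer crawling below ~1 kB/s for 300 s now fails with
    # pycurl error 28 rather than hanging indefinitely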
@@ -920,7 +938,7 @@ index e090e90..01218b0 100644
              if opts.ssl_cert_type:                
                  self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
              if opts.ssl_key_pass:
-@@ -1187,28 +1402,26 @@ class PyCurlFileObject():
+@@ -1187,28 +1413,26 @@ class PyCurlFileObject():
          if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
              self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
              
@@ -965,7 +983,7 @@ index e090e90..01218b0 100644
              
          # our url
          self.curl_obj.setopt(pycurl.URL, self.url)
-@@ -1228,12 +1441,14 @@ class PyCurlFileObject():
+@@ -1228,39 +1452,36 @@ class PyCurlFileObject():
              
              code = self.http_code
              errcode = e.args[0]
@@ -974,16 +992,19 @@ index e090e90..01218b0 100644
              if self._error[0]:
                  errcode = self._error[0]
                  
-             if errcode == 23 and code >= 200 and code < 299:
+-            if errcode == 23 and code >= 200 and code < 299:
 -                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
 -                err.url = self.url
-+                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
-+                err.url = errurl
-                 
+-                
++            if errcode == 23 and 200 <= code <= 299:
                  # this is probably wrong but ultimately this is what happens
                  # we have a legit http code and a pycurl 'writer failed' code
-@@ -1244,23 +1459,23 @@ class PyCurlFileObject():
-                 raise KeyboardInterrupt
+                 # which almost always means something aborted it from outside
+                 # since we cannot know what it is -I'm banking on it being
+                 # a ctrl-c. XXXX - if there's a way of going back two raises to 
+                 # figure out what aborted the pycurl process FIXME
+-                raise KeyboardInterrupt
++                raise getattr(self, '_cb_error', KeyboardInterrupt)
              
              elif errcode == 28:
 -                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
@@ -1008,12 +1029,10 @@ index e090e90..01218b0 100644
              elif errcode == 42:
 -                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
 -                err.url = self.url
-+                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
-+                err.url = errurl
                  # this is probably wrong but ultimately this is what happens
                  # we have a legit http code and a pycurl 'writer failed' code
                  # which almost always means something aborted it from outside
-@@ -1272,33 +1487,94 @@ class PyCurlFileObject():
+@@ -1272,33 +1493,94 @@ class PyCurlFileObject():
              elif errcode == 58:
                  msg = _("problem with the local client certificate")
                  err = URLGrabError(14, msg)
@@ -1040,8 +1059,9 @@ index e090e90..01218b0 100644
 +                err.url = errurl
                  raise err
                      
-             elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
+-            elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
 -                msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
++            elif str(e.args[1]) == '' and code and not 200 <= code <= 299:
 +                if self.scheme in ['http', 'https']:
 +                    if self.http_code in responses:
 +                        resp = responses[self.http_code]
@@ -1115,7 +1135,7 @@ index e090e90..01218b0 100644
  
      def _do_open(self):
          self.curl_obj = _curl_cache
-@@ -1333,7 +1609,11 @@ class PyCurlFileObject():
+@@ -1333,7 +1615,11 @@ class PyCurlFileObject():
                  
          if self.opts.range:
              rt = self.opts.range
@@ -1128,7 +1148,7 @@ index e090e90..01218b0 100644
  
          if rt:
              header = range_tuple_to_header(rt)
-@@ -1434,21 +1714,46 @@ class PyCurlFileObject():
+@@ -1434,21 +1720,46 @@ class PyCurlFileObject():
              #fh, self._temp_name = mkstemp()
              #self.fo = open(self._temp_name, 'wb')
  
@@ -1182,7 +1202,7 @@ index e090e90..01218b0 100644
          else:
              #self.fo = open(self._temp_name, 'r')
              self.fo.seek(0)
-@@ -1526,17 +1831,20 @@ class PyCurlFileObject():
+@@ -1526,17 +1837,20 @@ class PyCurlFileObject():
              if self._prog_running:
                  downloaded += self._reget_length
                  self.opts.progress_obj.update(downloaded)
@@ -1208,7 +1228,7 @@ index e090e90..01218b0 100644
  
              msg = _("Downloaded more than max size for %s: %s > %s") \
                          % (self.url, cur, max_size)
-@@ -1544,13 +1852,6 @@ class PyCurlFileObject():
+@@ -1544,13 +1858,6 @@ class PyCurlFileObject():
              return True
          return False
          
@@ -1222,7 +1242,7 @@ index e090e90..01218b0 100644
      def read(self, amt=None):
          self._fill_buffer(amt)
          if amt is None:
-@@ -1582,9 +1883,21 @@ class PyCurlFileObject():
+@@ -1582,9 +1889,21 @@ class PyCurlFileObject():
              self.opts.progress_obj.end(self._amount_read)
          self.fo.close()
          
@@ -1245,7 +1265,7 @@ index e090e90..01218b0 100644
  
  #####################################################################
  # DEPRECATED FUNCTIONS
-@@ -1621,6 +1934,466 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
+@@ -1621,6 +1940,467 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
  
          
  #####################################################################
@@ -1580,11 +1600,12 @@ index e090e90..01218b0 100644
 +                    if key in removed: continue
 +
 +                    # estimate mirror speed
-+                    speed = _TH.estimate(key)
++                    speed, fail = _TH.estimate(key)
 +                    speed /= 1 + host_con.get(key, 0)
 +
 +                    # order by: least failures, private flag, best speed
-+                    private = mirror.get('kwargs', {}).get('private', False)
++                    # ignore 'private' flag if there were failures
++                    private = not fail and mirror.get('kwargs', {}).get('private', False)
 +                    speed = -failed.get(key, 0), private, speed
 +                    if best is None or speed > best_speed:
 +                        best = mirror
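
The selection key above compares tuples lexicographically, so failure
count dominates, the private flag breaks ties among clean hosts, and
estimated speed only decides the rest.  A worked example with made-up
numbers:

    candidates = {
        'hostA': (0, True, 80000),    # clean, private, slow-ish
        'hostB': (0, False, 500000),  # clean, public, fast
        'hostC': (-1, False, 900000), # one failure, fastest
    }
    best = max(candidates, key=candidates.get)  # -> 'hostA'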
@@ -1701,19 +1722,19 @@ index e090e90..01218b0 100644
 +
 +        default_speed = default_grabber.opts.default_speed
 +        try: speed, fail, ts = _TH.hosts[host]
-+        except KeyError: return default_speed
++        except KeyError: return default_speed, 0
 +
 +        speed *= 2**-fail
 +        k = 2**((ts - time.time()) / default_grabber.opts.half_life)
 +        speed = k * speed + (1 - k) * default_speed
-+        return speed
++        return speed, fail
 +
 +#####################################################################
  #  TESTING
  def _main_test():
      try: url, filename = sys.argv[1:3]
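
_TH.estimate() above combines two decays: each recorded failure halves
the stored speed, and the measurement itself ages toward default_speed
with half-life opts.half_life.  A worked example with made-up numbers
(1 MB/s measured, one failure, last update exactly one half_life ago,
default_speed of 100 kB/s):

    speed, fail, default_speed = 1e6, 1, 1e5
    k = 2.0 ** -1                                # one half_life old
    speed *= 2 ** -fail                          # 500000.0
    speed = k * speed + (1 - k) * default_speed  # 300000.0
    # estimate() would return (300000.0, 1)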
 diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
-index dad410b..b17be17 100644
+index dad410b..7975f1b 100644
 --- a/urlgrabber/mirror.py
 +++ b/urlgrabber/mirror.py
 @@ -76,6 +76,9 @@ CUSTOMIZATION
@@ -1726,7 +1747,7 @@ index dad410b..b17be17 100644
      3) Pass keyword arguments when instantiating the mirror group.
         See, for example, the failure_callback argument.
  
-@@ -87,10 +90,12 @@ CUSTOMIZATION
+@@ -87,10 +90,14 @@ CUSTOMIZATION
  """
  
  
@@ -1737,10 +1758,12 @@ index dad410b..b17be17 100644
 -from grabber import URLGrabError, CallbackObject, DEBUG
 +from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
 +from grabber import _run_callback, _do_raise
++from grabber import exception2msg
++from grabber import _TH
  
  def _(st): 
      return st
-@@ -126,7 +131,9 @@ class MirrorGroup:
+@@ -126,7 +133,9 @@ class MirrorGroup:
          files)
  
        * if the local list is ever exhausted, a URLGrabError will be
@@ -1751,7 +1774,7 @@ index dad410b..b17be17 100644
  
      OPTIONS
  
-@@ -153,7 +160,8 @@ class MirrorGroup:
+@@ -153,7 +162,8 @@ class MirrorGroup:
  
          The 'fail' option will cause immediate failure by re-raising
          the exception and no further attempts to get the current
@@ -1761,7 +1784,7 @@ index dad410b..b17be17 100644
  
          This dict can be set at instantiation time,
            mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
-@@ -184,6 +192,7 @@ class MirrorGroup:
+@@ -184,6 +194,7 @@ class MirrorGroup:
  
             obj.exception    = < exception that was raised >
             obj.mirror       = < the mirror that was tried >
@@ -1769,7 +1792,25 @@ index dad410b..b17be17 100644
             obj.relative_url = < url relative to the mirror >
             obj.url          = < full url that failed >
                                # .url is just the combination of .mirror
-@@ -263,7 +272,8 @@ class MirrorGroup:
+@@ -251,6 +262,17 @@ class MirrorGroup:
+         self.default_action = None
+         self._process_kwargs(kwargs)
+ 
++        # use the same algorithm as parallel downloader to initially sort
++        # the mirror list (sort by speed, but prefer live private mirrors)
++        def estimate(m):
++            speed, fail = _TH.estimate(m['mirror'])
++            private = not fail and m.get('kwargs', {}).get('private', False)
++            return private, speed
++
++        # update the initial order.  since sorting is stable, the relative
++        # order of unknown (not used yet) hosts is retained.
++        self.mirrors.sort(key=estimate, reverse=True)
++
+     # if these values are found in **kwargs passed to one of the urlXXX
+     # methods, they will be stripped before getting passed on to the
+     # grabber
+@@ -263,7 +285,8 @@ class MirrorGroup:
      def _parse_mirrors(self, mirrors):
          parsed_mirrors = []
          for m in mirrors:
@@ -1779,7 +1820,7 @@ index dad410b..b17be17 100644
              parsed_mirrors.append(m)
          return parsed_mirrors
      
-@@ -280,7 +290,9 @@ class MirrorGroup:
+@@ -280,7 +303,9 @@ class MirrorGroup:
          #   return a random mirror so that multiple mirrors get used
          #   even without failures.
          if not gr.mirrors:
@@ -1790,7 +1831,7 @@ index dad410b..b17be17 100644
          return gr.mirrors[gr._next]
  
      def _failure(self, gr, cb_obj):
-@@ -307,7 +319,9 @@ class MirrorGroup:
+@@ -307,7 +332,9 @@ class MirrorGroup:
          a.update(action)
          action = a
          self.increment_mirror(gr, action)
@@ -1801,7 +1842,7 @@ index dad410b..b17be17 100644
  
      def increment_mirror(self, gr, action={}):
          """Tell the mirror object increment the mirror index
-@@ -377,35 +391,50 @@ class MirrorGroup:
+@@ -377,35 +404,50 @@ class MirrorGroup:
          gr.url  = url
          gr.kw   = dict(kw)
          self._load_gr(gr)
@@ -1828,7 +1869,7 @@ index dad410b..b17be17 100644
 +                return func_ref( *(fullurl,), opts=opts, **kw )
              except URLGrabError, e:
                  if DEBUG: DEBUG.info('MIRROR: failed')
-+                gr.errors.append((fullurl, str(e)))
++                gr.errors.append((fullurl, exception2msg(e)))
                  obj = CallbackObject()
                  obj.exception = e
                  obj.mirror = mirrorchoice['mirror']

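The MirrorGroup.__init__ hunk above reuses the same (private, speed)
key to pre-sort the mirror list, and leans on the stability of
list.sort(): hosts with no recorded history all get the default
estimate, so their original relative order survives.  In isolation,
with hypothetical data:

    mirrors = [{'mirror': 'a'}, {'mirror': 'b'}, {'mirror': 'c'}]
    est = {'a': (False, 1e5), 'b': (False, 1e5), 'c': (True, 8e4)}
    mirrors.sort(key=lambda m: est[m['mirror']], reverse=True)
    # -> c (a live private mirror) moves first; a and b share the
    #    default estimate and keep their original order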
