[python-urlgrabber/f17] Update to latest head

Zdeněk Pavlas zpavlas at fedoraproject.org
Fri Jul 20 12:11:52 UTC 2012


commit 604dc61e1723fda3f860def1a353e82d0fd3fc38
Author: Zdeněk Pavlas <zpavlas at redhat.com>
Date:   Fri Jul 20 14:10:53 2012 +0200

    Update to latest head

 python-urlgrabber.spec |    8 ++-
 urlgrabber-HEAD.patch  |  176 +++++++++++++++++++++++++++++++-----------------
 2 files changed, 122 insertions(+), 62 deletions(-)
---
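A note on the PROXYAUTH hunk in urlgrabber-HEAD.patch below: it replaces the
blanket pycurl.HTTPAUTH_ANY proxy-auth mask with one that excludes
GSS-Negotiate, so curl no longer attempts Kerberos against proxies (BZ 769254).
A minimal sketch of the resulting setup; setup_proxy, curl_obj and opts are
hypothetical stand-ins for the PyCurlFileObject state in grabber.py:

    import pycurl

    def setup_proxy(curl_obj, opts):
        # Hypothetical helper mirroring the proxy setup in _set_opts()
        if opts.proxy is not None:
            curl_obj.setopt(pycurl.PROXY, opts.proxy)
            # HTTPAUTH_* constants are bit flags, and the GSSNEGOTIATE bit
            # is set in HTTPAUTH_ANY, so subtracting it clears exactly that
            # bit: Basic, Digest and NTLM stay enabled, Kerberos does not.
            curl_obj.setopt(pycurl.PROXYAUTH,
                            pycurl.HTTPAUTH_ANY - pycurl.HTTPAUTH_GSSNEGOTIATE)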
diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec
index f57e573..5fbf43f 100644
--- a/python-urlgrabber.spec
+++ b/python-urlgrabber.spec
@@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.9.1
-Release: 13%{?dist}
+Release: 14%{?dist}
 Source0: urlgrabber-%{version}.tar.gz
 Patch1: urlgrabber-HEAD.patch
 
@@ -44,6 +44,12 @@ rm -rf $RPM_BUILD_ROOT
 %attr(0755,root,root) %{_libexecdir}/urlgrabber-ext-down
 
 %changelog
+* Fri Jul 20 2012 Zdenek Pavlas <zpavlas at redhat.com> - 3.9.1-14
+- update to latest HEAD
+- disable Kerberos proxy auth.  BZ 769254
+- fix copy_local issue. BZ 837018
+- send 'tries' counter to mirror failure callback
+
 * Thu Jun 14 2012 Zdenek Pavlas <zpavlas at redhat.com> - 3.9.1-13
 - update to latest HEAD
 - Start meters immediately, and only when asked to.  BZ 831904, 831291.
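Before the patch diff: the grabber.py hunks below drop the per-iteration
retrycode bookkeeping in URLGrabber._retry and derive the code once from the
stored exception, after the retry-limit check. A condensed, standalone sketch
of the reshaped loop under that reading, with callbacks and DEBUG logging
elided:

    from urlgrabber.grabber import URLGrabError

    def _retry(opts, func, *args):
        tries = 0
        while 1:
            # beware of infinite loops :)
            tries = tries + 1
            exception = None
            try:
                return apply(func, (opts,) + args, {})
            except URLGrabError, e:
                exception = e
                # (failure/interrupt callback handling elided; the
                # interrupt path stores exceptions with no .errno at all)
            if (opts.retry is None) or (tries == opts.retry):
                raise
            # computed once instead of being reset and reassigned every
            # iteration; getattr() falls back to None for exceptions
            # that carry no .errno attribute
            retrycode = getattr(exception, 'errno', None)
            if (retrycode is not None) and (retrycode not in opts.retrycodes):
                raise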
diff --git a/urlgrabber-HEAD.patch b/urlgrabber-HEAD.patch
index c77c6c5..4e1b34b 100644
--- a/urlgrabber-HEAD.patch
+++ b/urlgrabber-HEAD.patch
@@ -73,12 +73,29 @@ index 518e512..09cd896 100644
          print __doc__
 diff --git a/scripts/urlgrabber-ext-down b/scripts/urlgrabber-ext-down
 new file mode 100755
-index 0000000..670750c
+index 0000000..3da55a4
 --- /dev/null
 +++ b/scripts/urlgrabber-ext-down
-@@ -0,0 +1,55 @@
+@@ -0,0 +1,72 @@
 +#! /usr/bin/python
 +#  A very simple external downloader
++#  Copyright 2011-2012 Zdenek Pavlas
++
++#   This library is free software; you can redistribute it and/or
++#   modify it under the terms of the GNU Lesser General Public
++#   License as published by the Free Software Foundation; either
++#   version 2.1 of the License, or (at your option) any later version.
++#
++#   This library is distributed in the hope that it will be useful,
++#   but WITHOUT ANY WARRANTY; without even the implied warranty of
++#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++#   Lesser General Public License for more details.
++#
++#   You should have received a copy of the GNU Lesser General Public
++#   License along with this library; if not, write to the
++#      Free Software Foundation, Inc.,
++#      59 Temple Place, Suite 330,
++#      Boston, MA  02111-1307  USA
 +
 +import time, os, errno, sys
 +from urlgrabber.grabber import \
@@ -216,7 +233,7 @@ index 3e5f3b7..8eeaeda 100644
      return (fb,lb)
  
 diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index e090e90..071146c 100644
+index e090e90..83823ea 100644
 --- a/urlgrabber/grabber.py
 +++ b/urlgrabber/grabber.py
 @@ -49,7 +49,7 @@ GENERAL ARGUMENTS (kwargs)
@@ -558,7 +575,19 @@ index e090e90..071146c 100644
      """Provides easy opening of URLs with a variety of options.
      
      All options are specified as kwargs. Options may be specified when
-@@ -887,14 +1028,15 @@ class URLGrabber:
+@@ -872,7 +1013,6 @@ class URLGrabber:
+             # beware of infinite loops :)
+             tries = tries + 1
+             exception = None
+-            retrycode = None
+             callback  = None
+             if DEBUG: DEBUG.info('attempt %i/%s: %s',
+                                  tries, opts.retry, args[0])
+@@ -883,23 +1023,24 @@ class URLGrabber:
+             except URLGrabError, e:
+                 exception = e
+                 callback = opts.failure_callback
+-                retrycode = e.errno
              except KeyboardInterrupt, e:
                  exception = e
                  callback = opts.interrupt_callback
@@ -576,7 +605,13 @@ index e090e90..071146c 100644
  
              if (opts.retry is None) or (tries == opts.retry):
                  if DEBUG: DEBUG.info('retries exceeded, re-raising')
-@@ -912,9 +1054,11 @@ class URLGrabber:
+                 raise
+ 
++            retrycode = getattr(exception, 'errno', None)
+             if (retrycode is not None) and (retrycode not in opts.retrycodes):
+                 if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
+                                      retrycode, opts.retrycodes)
+@@ -912,9 +1053,11 @@ class URLGrabber:
          returned that supports them. The file object can be treated 
          like any other file object.
          """
@@ -588,7 +623,7 @@ index e090e90..071146c 100644
          def retryfunc(opts, url):
              return PyCurlFileObject(url, filename=None, opts=opts)
          return self._retry(opts, retryfunc, url)
-@@ -925,12 +1069,17 @@ class URLGrabber:
+@@ -925,12 +1068,17 @@ class URLGrabber:
          urlgrab returns the filename of the local file, which may be 
          different from the passed-in filename if copy_local == 0.
          """
@@ -606,7 +641,7 @@ index e090e90..071146c 100644
          if scheme == 'file' and not opts.copy_local:
              # just return the name of the local file - don't make a 
              # copy currently
-@@ -950,30 +1099,36 @@ class URLGrabber:
+@@ -950,30 +1098,36 @@ class URLGrabber:
  
              elif not opts.range:
                  if not opts.checkfunc is None:
@@ -656,7 +691,7 @@ index e090e90..071146c 100644
      
      def urlread(self, url, limit=None, **kwargs):
          """read the url into a string, up to 'limit' bytes
-@@ -982,9 +1137,11 @@ class URLGrabber:
+@@ -982,9 +1136,11 @@ class URLGrabber:
          "I want the first N bytes" but rather 'read the whole file 
          into memory, but don't use too much'
          """
@@ -668,7 +703,7 @@ index e090e90..071146c 100644
          if limit is not None:
              limit = limit + 1
              
-@@ -1000,12 +1157,8 @@ class URLGrabber:
+@@ -1000,12 +1156,8 @@ class URLGrabber:
                  else: s = fo.read(limit)
  
                  if not opts.checkfunc is None:
@@ -683,7 +718,7 @@ index e090e90..071146c 100644
              finally:
                  fo.close()
              return s
-@@ -1020,6 +1173,7 @@ class URLGrabber:
+@@ -1020,6 +1172,7 @@ class URLGrabber:
          return s
          
      def _make_callback(self, callback_obj):
@@ -691,7 +726,7 @@ index e090e90..071146c 100644
          if callable(callback_obj):
              return callback_obj, (), {}
          else:
-@@ -1030,7 +1184,7 @@ class URLGrabber:
+@@ -1030,7 +1183,7 @@ class URLGrabber:
  default_grabber = URLGrabber()
  
  
@@ -700,7 +735,7 @@ index e090e90..071146c 100644
      def __init__(self, url, filename, opts):
          self.fo = None
          self._hdr_dump = ''
-@@ -1052,10 +1206,11 @@ class PyCurlFileObject():
+@@ -1052,10 +1205,11 @@ class PyCurlFileObject():
          self._reget_length = 0
          self._prog_running = False
          self._error = (None, None)
@@ -714,7 +749,7 @@ index e090e90..071146c 100644
      def __getattr__(self, name):
          """This effectively allows us to wrap at the instance level.
          Any attribute not found in _this_ object will be searched for
-@@ -1085,9 +1240,14 @@ class PyCurlFileObject():
+@@ -1085,9 +1239,14 @@ class PyCurlFileObject():
              return -1
              
      def _hdr_retrieve(self, buf):
@@ -730,7 +765,7 @@ index e090e90..071146c 100644
          try:
              self._hdr_dump += buf
              # we have to get the size before we do the progress obj start
-@@ -1104,7 +1264,17 @@ class PyCurlFileObject():
+@@ -1104,7 +1263,17 @@ class PyCurlFileObject():
                      s = parse150(buf)
                  if s:
                      self.size = int(s)
@@ -749,7 +784,7 @@ index e090e90..071146c 100644
              return len(buf)
          except KeyboardInterrupt:
              return pycurl.READFUNC_ABORT
-@@ -1113,8 +1283,10 @@ class PyCurlFileObject():
+@@ -1113,8 +1282,10 @@ class PyCurlFileObject():
          if self._parsed_hdr:
              return self._parsed_hdr
          statusend = self._hdr_dump.find('\n')
@@ -760,7 +795,7 @@ index e090e90..071146c 100644
          self._parsed_hdr =  mimetools.Message(hdrfp)
          return self._parsed_hdr
      
-@@ -1127,6 +1299,9 @@ class PyCurlFileObject():
+@@ -1127,6 +1298,9 @@ class PyCurlFileObject():
          if not opts:
              opts = self.opts
  
@@ -770,7 +805,7 @@ index e090e90..071146c 100644
  
          # defaults we're always going to set
          self.curl_obj.setopt(pycurl.NOPROGRESS, False)
-@@ -1136,11 +1311,21 @@ class PyCurlFileObject():
+@@ -1136,11 +1310,21 @@ class PyCurlFileObject():
          self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
          self.curl_obj.setopt(pycurl.FAILONERROR, True)
          self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
@@ -792,7 +827,7 @@ index e090e90..071146c 100644
          
          # maybe to be options later
          self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
-@@ -1148,9 +1333,11 @@ class PyCurlFileObject():
+@@ -1148,9 +1332,11 @@ class PyCurlFileObject():
          
          # timeouts
          timeout = 300
@@ -807,7 +842,7 @@ index e090e90..071146c 100644
  
          # ssl options
          if self.scheme == 'https':
-@@ -1158,13 +1345,16 @@ class PyCurlFileObject():
+@@ -1158,13 +1344,16 @@ class PyCurlFileObject():
                  self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
                  self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
              self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
@@ -825,7 +860,7 @@ index e090e90..071146c 100644
              if opts.ssl_cert_type:                
                  self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
              if opts.ssl_key_pass:
-@@ -1187,28 +1377,24 @@ class PyCurlFileObject():
+@@ -1187,28 +1376,26 @@ class PyCurlFileObject():
          if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
              self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
              
@@ -849,7 +884,9 @@ index e090e90..071146c 100644
 +        # proxy
 +        if opts.proxy is not None:
 +            self.curl_obj.setopt(pycurl.PROXY, opts.proxy)
-+            self.curl_obj.setopt(pycurl.PROXYAUTH, pycurl.HTTPAUTH_ANY)
++            self.curl_obj.setopt(pycurl.PROXYAUTH,
++                # All but Kerberos.  BZ 769254
++                pycurl.HTTPAUTH_ANY - pycurl.HTTPAUTH_GSSNEGOTIATE)
 +
 +        if opts.username and opts.password:
 +            if self.scheme in ('http', 'https'):
@@ -868,7 +905,7 @@ index e090e90..071146c 100644
              
          # our url
          self.curl_obj.setopt(pycurl.URL, self.url)
-@@ -1228,12 +1414,14 @@ class PyCurlFileObject():
+@@ -1228,12 +1415,14 @@ class PyCurlFileObject():
              
              code = self.http_code
              errcode = e.args[0]
@@ -885,7 +922,7 @@ index e090e90..071146c 100644
                  
                  # this is probably wrong but ultimately this is what happens
                  # we have a legit http code and a pycurl 'writer failed' code
-@@ -1244,23 +1432,23 @@ class PyCurlFileObject():
+@@ -1244,23 +1433,23 @@ class PyCurlFileObject():
                  raise KeyboardInterrupt
              
              elif errcode == 28:
@@ -916,7 +953,7 @@ index e090e90..071146c 100644
                  # this is probably wrong but ultimately this is what happens
                  # we have a legit http code and a pycurl 'writer failed' code
                  # which almost always means something aborted it from outside
-@@ -1272,33 +1460,94 @@ class PyCurlFileObject():
+@@ -1272,33 +1461,94 @@ class PyCurlFileObject():
              elif errcode == 58:
                  msg = _("problem with the local client certificate")
                  err = URLGrabError(14, msg)
@@ -1018,7 +1055,7 @@ index e090e90..071146c 100644
  
      def _do_open(self):
          self.curl_obj = _curl_cache
-@@ -1333,7 +1582,11 @@ class PyCurlFileObject():
+@@ -1333,7 +1583,11 @@ class PyCurlFileObject():
                  
          if self.opts.range:
              rt = self.opts.range
@@ -1031,7 +1068,7 @@ index e090e90..071146c 100644
  
          if rt:
              header = range_tuple_to_header(rt)
-@@ -1434,21 +1687,46 @@ class PyCurlFileObject():
+@@ -1434,21 +1688,46 @@ class PyCurlFileObject():
              #fh, self._temp_name = mkstemp()
              #self.fo = open(self._temp_name, 'wb')
  
@@ -1085,7 +1122,7 @@ index e090e90..071146c 100644
          else:
              #self.fo = open(self._temp_name, 'r')
              self.fo.seek(0)
-@@ -1526,17 +1804,20 @@ class PyCurlFileObject():
+@@ -1526,17 +1805,20 @@ class PyCurlFileObject():
              if self._prog_running:
                  downloaded += self._reget_length
                  self.opts.progress_obj.update(downloaded)
@@ -1111,7 +1148,7 @@ index e090e90..071146c 100644
  
              msg = _("Downloaded more than max size for %s: %s > %s") \
                          % (self.url, cur, max_size)
-@@ -1544,13 +1825,6 @@ class PyCurlFileObject():
+@@ -1544,13 +1826,6 @@ class PyCurlFileObject():
              return True
          return False
          
@@ -1125,7 +1162,7 @@ index e090e90..071146c 100644
      def read(self, amt=None):
          self._fill_buffer(amt)
          if amt is None:
-@@ -1582,9 +1856,21 @@ class PyCurlFileObject():
+@@ -1582,9 +1857,21 @@ class PyCurlFileObject():
              self.opts.progress_obj.end(self._amount_read)
          self.fo.close()
          
@@ -1148,7 +1185,7 @@ index e090e90..071146c 100644
  
  #####################################################################
  # DEPRECATED FUNCTIONS
-@@ -1621,6 +1907,433 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
+@@ -1621,6 +1908,442 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
  
          
  #####################################################################
@@ -1415,18 +1452,23 @@ index e090e90..071146c 100644
 +                try: _run_callback(opts.failure_callback, opts)
 +                except URLGrabError, ug_err:
 +                    retry = 0 # no retries
-+            if opts.tries < retry and ug_err.args[0] in opts.retrycodes:
++            if opts.tries < retry and ug_err.errno in opts.retrycodes:
 +                start(opts, opts.tries + 1) # simple retry
 +                continue
 +
 +            if opts.mirror_group:
-+                mg, failed = opts.mirror_group
++                mg, failed, removed = opts.mirror_group
++                failed[key] = failed.get(key, 0) + 1
 +                opts.mirror = key
 +                opts.exception = ug_err
-+                action = _run_callback(mg.failure_callback, opts)
-+                if not (action and action.get('fail')):
++                action = mg.default_action or {}
++                if mg.failure_callback:
++                    opts.tries = sum(failed.values())
++                    action.update(_run_callback(mg.failure_callback, opts))
++                if not action.get('fail', 0):
 +                    # mask this mirror and retry
-+                    failed.add(key)
++                    if action.get('remove', 1):
++                        removed.add(key)
 +                    _async_queue.append(opts)
 +                    continue
 +
@@ -1453,17 +1495,21 @@ index e090e90..071146c 100644
 +                perform()
 +
 +            if opts.mirror_group:
-+                mg, failed = opts.mirror_group
++                mg, failed, removed = opts.mirror_group
 +
 +                # find the best mirror
 +                best = None
++                best_speed = None
 +                for mirror in mg.mirrors:
 +                    key = mirror['mirror']
-+                    if key in failed: continue
++                    if key in removed: continue
 +
 +                    # estimate mirror speed
 +                    speed = _TH.estimate(key)
 +                    speed /= 1 + host_con.get(key, 0)
++
++                    # 2-tuple to select mirror with least failures
++                    speed = -failed.get(key, 0), speed
 +                    if best is None or speed > best_speed:
 +                        best = mirror
 +                        best_speed = speed
@@ -1583,7 +1629,7 @@ index e090e90..071146c 100644
  def _main_test():
      try: url, filename = sys.argv[1:3]
 diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
-index dad410b..d699b61 100644
+index dad410b..ac78b34 100644
 --- a/urlgrabber/mirror.py
 +++ b/urlgrabber/mirror.py
 @@ -76,6 +76,9 @@ CUSTOMIZATION
@@ -1602,20 +1648,19 @@ index dad410b..d699b61 100644
  
 -from grabber import URLGrabError, CallbackObject, DEBUG
 +from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
-+from grabber import _run_callback, _do_raise, _async_queue
++from grabber import _run_callback, _do_raise
  
  def _(st): 
      return st
-@@ -254,7 +258,7 @@ class MirrorGroup:
-     # if these values are found in **kwargs passed to one of the urlXXX
-     # methods, they will be stripped before getting passed on to the
-     # grabber
--    options = ['default_action', 'failure_callback']
-+    options = ['default_action', 'failure_callback', 'failfunc']
-     
-     def _process_kwargs(self, kwargs):
-         self.failure_callback = kwargs.get('failure_callback')
-@@ -263,7 +267,8 @@ class MirrorGroup:
+@@ -184,6 +188,7 @@ class MirrorGroup:
+ 
+            obj.exception    = < exception that was raised >
+            obj.mirror       = < the mirror that was tried >
++           obj.tries        = < the number of mirror tries so far >
+            obj.relative_url = < url relative to the mirror >
+            obj.url          = < full url that failed >
+                               # .url is just the combination of .mirror
+@@ -263,7 +268,8 @@ class MirrorGroup:
      def _parse_mirrors(self, mirrors):
          parsed_mirrors = []
          for m in mirrors:
@@ -1625,23 +1670,32 @@ index dad410b..d699b61 100644
              parsed_mirrors.append(m)
          return parsed_mirrors
      
-@@ -402,10 +407,25 @@ class MirrorGroup:
+@@ -382,7 +388,9 @@ class MirrorGroup:
+             try: del kw[k]
+             except KeyError: pass
+ 
++        tries = 0
+         while 1:
++            tries += 1
+             mirrorchoice = self._get_mirror(gr)
+             fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
+             kwargs = dict(mirrorchoice.get('kwargs', {}))
+@@ -399,13 +407,24 @@ class MirrorGroup:
+                 obj.mirror = mirrorchoice['mirror']
+                 obj.relative_url = gr.url
+                 obj.url = fullurl
++                obj.tries = tries
                  self._failure(gr, obj)
  
      def urlgrab(self, url, filename=None, **kwargs):
-+        if kwargs.get('async'):
-+            opts = self.grabber.opts.derive(**kwargs)
-+            opts.mirror_group = self, set()
-+            opts.relative_url = _to_utf8(url)
-+
-+            opts.url = 'http://tbd'
-+            opts.filename = filename
-+            opts.size = int(opts.size or 0)
-+            _async_queue.append(opts)
-+            return filename
-+
          kw = dict(kwargs)
          kw['filename'] = filename
++        if kw.get('async'):
++            # enable mirror failovers in async path
++            kw['mirror_group'] = self, {}, set()
++            kw['relative_url'] = url
++        else:
++            kw.pop('failfunc', None)
          func = 'urlgrab'
 -        return self._mirror_try(func, url, kw)
 +        try:
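
For reference, the new `tries` attribute documented in the mirror.py hunk
above is visible to mirror failure callbacks on both paths: the serial path
counts loop iterations, and the async path sets it to the sum of per-mirror
failure counts. A minimal, hypothetical usage sketch; the mirror URLs and
file names are placeholders:

    from urlgrabber.grabber import URLGrabber
    from urlgrabber.mirror import MirrorGroup

    def on_failure(cb_obj):
        # .mirror, .exception, .relative_url and .url were already set;
        # .tries is the counter added by this update
        print 'try %d: %s failed (%s)' % (
            cb_obj.tries, cb_obj.mirror, cb_obj.exception)
        # returning {'fail': 1} would abort instead of failing over

    mg = MirrorGroup(URLGrabber(),
                     ['http://mirror-a.example/repo/',
                      'http://mirror-b.example/repo/'],
                     failure_callback=on_failure)
    mg.urlgrab('packages/foo.rpm', filename='/tmp/foo.rpm')

Related design note: the async scheduler hunk ranks candidate mirrors by the
2-tuple (-failures, estimated speed), so Python's tuple comparison prefers
the least-failed mirror first and falls back to the speed estimate only to
break ties.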

