[epylog: 290/297] add rsyncd module - epylog - Fedora mailing-lists

23 Jan 2012

commit ac1d87a876ae1fc1b87e2935c594a8f2d765becb
Author: Seth Vidal skvidal@fedoraproject.org
Date:   Wed Aug 24 17:04:39 2011 -0400
add rsyncd module
 etc/modules.d/rsyncd.conf.in |   15 +++
 modules/rsyncd_mod.py        |  219 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 234 insertions(+), 0 deletions(-)
---

diff --git a/etc/modules.d/rsyncd.conf.in b/etc/modules.d/rsyncd.conf.in
new file mode 100644
index 0000000..a271f13
--- /dev/null
+++ b/etc/modules.d/rsyncd.conf.in
@@ -0,0 +1,15 @@
+[module]
+desc = Rsyncd
+exec = %%MODULES_DIR%%/rsyncd_mod.py
+files = /var/log/messages[.#]
+enabled = no
+internal = yes
+outhtml = yes
+priority = 7
+
+[conf]
+##
+# Report this many "top ranking hosts"
+#
+report_top = 10
+# ignore_hosts = space-separated list of DNS-resolved host names whose rsyncs should be left out of the report
diff --git a/modules/rsyncd_mod.py b/modules/rsyncd_mod.py
new file mode 100644
index 0000000..5ec899c
--- /dev/null
+++ b/modules/rsyncd_mod.py
@@ -0,0 +1,219 @@
+#!/usr/bin/python -tt
+"""
+Rsyncd log parsing module for Epylog
+"""
+
+##
+# Copyright (C) 2003 by Duke University
+# Written by Seth Vidal <skvidal at phy.duke.edu>
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+#
+
+
+import sys
+import re
+
+##
+# This is for testing purposes, so you can invoke this from the
+# modules directory. See also the testing notes at the end of the
+# file.
+#
+sys.path.insert(0, '../py/')
+from epylog import Result, InternalModule
+
+class rsyncd_mod(InternalModule):
+    ##
+    # opts: is a map with extra options set in
+    #       [conf] section of the module config, or on the
+    #       command line using -o flag to the module.
+    # logger: A logging object. API:
+    #         logger.put(loglvl, 'Message')
+    #         Only critical stuff needs to go onto lvl 0.
+    #         Common output goes to lvl 1.
+    #         Others are debug levels.
+    #
+    def __init__(self, opts, logger):
+        ##
+        # Do a "super-init" so the class we are subclassing gets
+        # instantiated.
+        #
+        InternalModule.__init__(self)
+        self.logger = logger
+        ##
+        # Convenience
+        #
+        rc = re.compile
+
+        self.regex_map = {
+            rc('rsyncd[\d+]: rsync on'): self.rsync_hosts,
+            rc('rsyncd[\d+]: (?:sent|wrote)\s\S*\sbytes'): self.rsync_results
+        }
+        self.topcount = int(opts.get('report_top', 5)) #get report_top, default to 5 if not set
+        ig_s = opts.get('ignore_hosts', '')
+        ig_s.replace(',',' ')
+        self.ignore_hosts = ig_s.split(' ')
+        # dict to store all of our data
+        self.rsync_pid_bytes = {}
+        self.rsync_pid_host = {}
+        self.rsync_host_loc = rc('rsyncd[(\d+)]: rsync\son\s(\S*)\sfrom\s.*((\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}))')
+        self.rsync_bytes = rc('rsyncd[(\d+)]: (?:sent|wrote)\s(\d+) bytes  (?:read|received)\s(\d+) bytes  total size (\d+)')
+
+    def rsync_hosts(self, linemap):
+        (sys, msg, multi) = self.get_smm(linemap)
+        pid, loc, ip = self.rsync_host_loc.search(msg).groups()
+        host = self.gethost(ip)
+        if host not in self.ignore_hosts:
+            self.rsync_pid_host[pid] = (host, loc)
+        return {(loc, host): 1}
+
+    def rsync_results(self, linemap):
+        (sys, msg, multi) = self.get_smm(linemap)
+        pid, wbytes, rbytes, tbytes = self.rsync_bytes.search(msg).groups()
+        self.rsync_pid_bytes[pid] = (wbytes, rbytes, tbytes)
+        return {(pid, wbytes): 1}
+
+    def _uniq(self, s):
+        """Return a list of the elements in s, but without duplicates.
+    
+        For example, unique([1,2,3,1,2,3]) is some permutation of [1,2,3],
+        unique("abcabc") some permutation of ["a", "b", "c"], and
+        unique(([1, 2], [2, 3], [1, 2])) some permutation of
+        [[2, 3], [1, 2]].
+    
+        For best speed, all sequence elements should be hashable.  Then
+        unique() will usually work in linear time.
+    
+        If not possible, the sequence elements should enjoy a total
+        ordering, and if list(s).sort() doesn't raise TypeError it's
+        assumed that they do enjoy a total ordering.  Then unique() will
+        usually work in O(N*log2(N)) time.
+    
+        If that's not possible either, the sequence elements must support
+        equality-testing.  Then unique() will usually work in quadratic
+        time.
+        """
+    
+        n = len(s)
+        if n == 0:
+            return []
+    
+        # Try using a dict first, as that's the fastest and will usually
+        # work.  If it doesn't work, it will usually fail quickly, so it
+        # usually doesn't cost much to *try* it.  It requires that all the
+        # sequence elements be hashable, and support equality comparison.
+        u = {}
+        try:
+            for x in s:
+                u[x] = 1
+        except TypeError:
+            del u  # move on to the next method
+        else:
+            return u.keys()
+    
+        # We can't hash all the elements.  Second fastest is to sort,
+        # which brings the equal elements together; then duplicates are
+        # easy to weed out in a single pass.
+        # NOTE:  Python's list.sort() was designed to be efficient in the
+        # presence of many duplicate elements.  This isn't true of all
+        # sort functions in all languages or libraries, so this approach
+        # is more effective in Python than it may be elsewhere.
+        try:
+            t = list(s)
+            t.sort()
+        except TypeError:
+            del t  # move on to the next method
+        else:
+            assert n > 0
+            last = t[0]
+            lasti = i = 1
+            while i < n:
+                if t[i] != last:
+                    t[lasti] = last = t[i]
+                    lasti += 1
+                i += 1
+            return t[:lasti]
+    
+        # Brute force is all that's left.
+        u = []
+        for x in s:
+            if x not in u:
+                u.append(x)
+        return u
+        
+    def _sortByVal(self, dict, reverse=0):
+        if type(dict) is not type({}): return []
+        keys = dict.keys()
+        s = map(lambda k: (dict[k], k), keys)
+        s.sort()
+        if reverse: s.reverse()
+        return s
+        
+    def finalize(self, resultset):
+        ##
+        # A resultset is a dictionary of all values returned by your
+        # handler functions -- except they are unique and show how many
+        # times each tuple occurs.
+        # See epylog.Result for some convenience methods to use when
+        # processing and analyzing the results.
+        #
+        
+        hostloc = {} # key = host, val = [loc, loc, loc]
+        hosttotal = {} # key = host val = totalwbytes
+        
+        foo = "<table border=0>\n\t<tr>\n"
+        
+        for pid in self.rsync_pid_host.keys():
+            (host, loc) = self.rsync_pid_host[pid]
+            if self.rsync_pid_bytes.has_key(pid):
+                if not hostloc.has_key(host):
+                    hostloc[host] = []
+                if not hosttotal.has_key(host):
+                    hosttotal[host] = 0L
+                hostloc[host].append(loc)
+                bytes = long(self.rsync_pid_bytes[pid][0])
+                hosttotal[host] += bytes
+        
+        for host in hostloc.keys():
+            hostloc[host] = self._uniq(hostloc[host])
+        
+        hosts = self._sortByVal(hosttotal, 1)
+        count = 0L
+        for (tot,host) in hosts[:self.topcount]:
+            if count % 2:
+                bgcolor = "#dddddd"
+            else:
+                bgcolor = "#ffffff"
+            count+=1
+            line = '\t\t<td bgcolor=%s valign="top">%s</td>\n' % (bgcolor, host)
+            line = line + '\t\t<td bgcolor=%s valign="top">\n' % bgcolor
+            for loc in hostloc[host]:
+                line = line + '\t\t\t%s<br>\n' % loc
+            line = line + '\t\t</td>\n'
+            size, marker = self.mk_size_unit(hosttotal[host])
+            line = line + '\t\t<td bgcolor=%s valign="top">%s%s</td>\n' % (bgcolor, size, marker)
+            line = line + '\t</tr>\n'
+            foo = foo + line
+        foo = foo + '</table>\n'
+        return foo
+
+##
+# Stand-alone test entry point: when this file is executed directly,
+# the module class and the command-line arguments are handed to
+# epylog's ModuleTest helper.
+# Invoke without command-line parameters to learn about the proper
+# invocation.
+#
+if __name__ == '__main__':
+    from epylog.helpers import ModuleTest
+    ModuleTest(rsyncd_mod, sys.argv)