[PATCH 1/3] Preliminary linktest work

Paul W. Frields stickster at gmail.com
Wed Aug 19 19:24:31 UTC 2015


---
 .gitignore          |   2 +-
 Makefile.in         |   3 +
 tools/linkcheckerrc | 272 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 276 insertions(+), 1 deletion(-)
 create mode 100644 tools/linkcheckerrc

diff --git a/.gitignore b/.gitignore
index d314cc7..1209aa3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,4 @@ httpd/logs/*_log
 httpd/run/*.pid
 */out
 */data/templates/translations.html
-
+*/linkchecker-out.csv
diff --git a/Makefile.in b/Makefile.in
index 4251629..3091f6d 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -84,6 +84,9 @@ stoptest: clean
 	pid=`cat $(HTTPDDIR)/run/httpd.pid` && \
 	kill -TERM $$pid
 
+linktest:
+	linkchecker -q -f ../tools/linkcheckerrc -v http://localhost:5000
+
 clean:
 	rm -rf out
 	rm -f po/*.mo
diff --git a/tools/linkcheckerrc b/tools/linkcheckerrc
new file mode 100644
index 0000000..a24b536
--- /dev/null
+++ b/tools/linkcheckerrc
@@ -0,0 +1,272 @@
+# Sample configuration file; see the linkcheckerrc(5) man page or
+# execute linkchecker -h for help on these options.
+# Commandline or GUI options override these settings.
+
+##################### output configuration ##########################
+[output]
+# enable debug messages; see 'linkchecker -h' for valid debug names
+#debug=all
+# print status output
+#status=1
+# change the logging type
+#log=xml
+# turn on/off --verbose
+#verbose=1
+# turn on/off --warnings
+#warnings=0
+# turn on/off --quiet
+quiet=1
+# additional file output
+#fileoutput = text, html, gml, sql
+fileoutput = csv
+
+
+##################### logger configuration ##########################
+# Note that the logger configuration is ignored by the linkchecker-gui
+# program. Results in the GUI can be saved to a file with the command
+# File -> Save results.
+#
+# logger output part names:
+# all       For all parts
+# realurl   The full url link
+# result    Valid or invalid, with messages
+# extern    1 or 0, only reported by some logger types
+# base      <base href=...>
+# name      <a href=...>name</a> and <img alt="name">
+# parenturl The referrer URL if there is any
+# info      Some additional info, e.g. FTP welcome messages
+# warning   Warnings
+# dltime    Download time
+# checktime Check time
+# url       The original url name, can be relative
+# intro     The blurb at the beginning, "starting at ..."
+# outro     The blurb at the end, "found x errors ..."
+# stats     Statistics including URL lengths and contents.
+
+# each Logger can have separate configuration parameters
+
+# standard text logger
+[text]
+#filename=linkchecker-out.txt
+#parts=all
+# colors for the various parts, syntax is <color> or <type>;<color>
+# type can be bold, light, blink, invert
+# color can be default, black, red, green, yellow, blue, purple, cyan, white,
+# Black, Red, Green, Yellow, Blue, Purple, Cyan, White
+#colorparent=white
+#colorurl=default
+#colorname=default
+#colorreal=cyan
+#colorbase=purple
+#colorvalid=bold;green
+#colorinvalid=bold;red
+#colorinfo=default
+#colorwarning=bold;yellow
+#colordltime=default
+#colorreset=default
+
+# GML logger
+[gml]
+#filename=linkchecker-out.gml
+#parts=all
+# valid encodings are listed in http://docs.python.org/library/codecs.html#standard-encodings
+# default encoding is iso-8859-15
+#encoding=utf_16
+
+# DOT logger
+[dot]
+#filename=linkchecker-out.dot
+#parts=all
+# default encoding is ascii since the original DOT format does not
+# support other charsets
+#encoding=iso-8859-15
+
+# CSV logger
+[csv]
+#filename=linkchecker-out.csv
+separator=,
+quotechar="
+#parts=all
+
+# SQL logger
+[sql]
+#filename=linkchecker-out.sql
+#dbname=linksdb
+#separator=;
+#parts=all
+
+# HTML logger
+[html]
+#filename=linkchecker-out.html
+# colors for the various parts
+#colorbackground=#fff7e5
+#colorurl=#dcd5cf
+#colorborder=#000000
+#colorlink=#191c83
+#colorwarning=#e0954e
+#colorerror=#db4930
+#colorok=#3ba557
+#parts=all
+
+# blacklist logger
+[blacklist]
+#filename=~/.linkchecker/blacklist
+
+# custom xml logger
+[xml]
+#encoding=iso-8859-1
+
+# GraphXML logger
+[gxml]
+#encoding=iso-8859-1
+
+# Sitemap logger
+[sitemap]
+#priority=0.7
+#frequency=weekly
+
+
+##################### checking options ##########################
+[checking]
+# number of threads
+threads=100
+# connection timeout in seconds
+timeout=10
+# Time to wait for checks to finish after the user aborts the first time
+# (with Ctrl-C or the abort button).
+#aborttimeout=300
+# The recursion level determines how many times links inside pages are followed.
+#recursionlevel=1
+# Basic NNTP server. Overrides NNTP_SERVER environment variable.
+#nntpserver=
+# parse a cookiefile for initial cookie data
+#cookiefile=/path/to/cookies.txt
+# User-Agent header string to send to HTTP web servers
+# Note that robots.txt files are always checked with the original User-Agent.
+useragent=Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
+# When checking finishes, write a memory dump to a temporary file.
+# The memory dump is written both when checking finishes normally
+# and when checking gets canceled.
+# The memory dump only works if the python-meliae package is installed.
+# Otherwise a warning is printed to install it.
+#debugmemory=0
+# When checking absolute URLs inside local files, the given root directory
+# is used as base URL.
+# Note that the given directory must have URL syntax, so it must use a slash
+# to join directories instead of a backslash.
+# And the given directory must end with a slash.
+# Unix example:
+#localwebroot=/var/www/
+# Windows example:
+#localwebroot=/C|/public_html/
+# Check SSL certificates. Set to an absolute pathname for a custom
+# CA cert bundle to use. Set to zero to disable SSL certificate verification.
+sslverify=0
+# Stop checking new URLs after the given number of seconds. Same as if the
+# user hits Ctrl-C after X seconds.
+#maxrunseconds=600
+# Maximum number of URLs to check. New URLs will not be queued after the
+# given number of URLs is checked.
+#maxnumurls=153
+# Maximum number of requests per second to one host.
+#maxrequestspersecond=10
+# Allowed URL schemes as a comma-separated list.
+#allowedschemes=http,https
+
+##################### filtering options ##########################
+[filtering]
+#ignore=
+# ignore everything with 'lconline' in the URL name
+#  lconline
+# and ignore everything with 'bookmark' in the URL name
+#  bookmark
+# and ignore all mailto: URLs
+#  ^mailto:
+# do not recurse into the following URLs
+
+#nofollow=
+# just an example
+#  http://www\.example\.com/bla
+
+# Ignore specified warnings (see linkchecker -h for the list of
+# recognized warnings). Add a comma-separated list of warnings here
+# that prevent a valid URL from being logged. Note that the warning
+# will still be logged for invalid URLs.
+#ignorewarnings=url-unicode-domain
+# Regular expression to add more URLs recognized as internal links.
+# Default is that URLs given on the command line are internal.
+#internlinks=^http://www\.example\.net/
+# Check external links
+checkextern=1
+
+
+##################### password authentication ##########################
+[authentication]
+# WARNING: if you store passwords in this configuration entry, make sure the
+# configuration file is not readable by other users.
+# Different user/password pairs for different URLs can be provided.
+# Entries are a triple (URL regular expression, username, password),
+# separated by whitespace.
+# If the regular expression matches, the given user/password pair is used
+# for authentication. The commandline options -u,-p match every link
+# and therefore override the entries given here. The first match wins.
+# At the moment, authentication is used for http[s] and ftp links.
+#entry=
+# Note that passwords are optional. If any passwords are stored here,
+# this file should not be readable by other users.
+#  ^https?://www\.example\.com/~calvin/ calvin mypass
+#  ^ftp://www\.example\.com/secret/ calvin
+
+# If the website requires a login, the login URL and optionally the user and
+# password CGI field names can be provided.
+#loginurl=http://www.example.com/
+
+# The name of the user and password CGI field
+#loginuserfield=login
+#loginpasswordfield=password
+# Optionally any additional CGI name/value pairs. Note that the default
+# values are submitted automatically.
+#loginextrafields=
+#  name1:value1
+#  name 2:value 2
+
+############################ Plugins ###################################
+#
+# uncomment sections to enable plugins
+
+# Check HTML anchors
+#[AnchorCheck]
+
+# Print HTTP header info
+#[HttpHeaderInfo]
+# Comma separated list of header prefixes to print.
+# The names are case insensitive.
+# The default list is empty, so it should be non-empty when activating
+# this plugin.
+#prefixes=Server,X-
+
+# Add country info to URLs
+#[LocationInfo]
+
+# Run W3C syntax checks
+#[CssSyntaxCheck]
+#[HtmlSyntaxCheck]
+
+# Search for regular expression in page contents
+#[RegexCheck]
+#warningregex=Oracle Error
+
+# Search for viruses in page contents
+#[VirusCheck]
+#clamavconf=/etc/clamav/clam.conf
+
+# Check that SSL certificates are valid for at least the given number of days.
+#[SslCertificateCheck]
+#sslcertwarndays=14
+
+# Parse and check links in PDF files
+#[PdfParser]
+
+# Parse and check links in Word files
+#[WordParser]
+
-- 
2.4.3




More information about the websites mailing list