commit 05f9135f63fa736ceeb23480028f959b2f8881a7
Author: Radek Pazdera <rpazdera(a)redhat.com>
Date: Thu Sep 12 14:03:30 2013 +0200
XmlParser: Improved XInclude Error Reporting
Due to the way XInclude is implemented in lxml, we are unable to get the
origin of each tag in the document. This patch provides our own
simplified implementation of XInclude resolving that fixes this problem.
Signed-off-by: Radek Pazdera <rpazdera(a)redhat.com>
Signed-off-by: Jiri Pirko <jiri(a)resnulli.us>
lnst/Common/XmlParser.py | 115 +++++++++++++++++++++++++++++++++------
lnst/Common/XmlProcessing.py | 36 +++++++++---
lnst/Controller/RecipeParser.py | 7 ---
3 files changed, 126 insertions(+), 32 deletions(-)
---
diff --git a/lnst/Common/XmlParser.py b/lnst/Common/XmlParser.py
index 68d3c76..c404c0b 100644
--- a/lnst/Common/XmlParser.py
+++ b/lnst/Common/XmlParser.py
@@ -11,13 +11,17 @@ rpazdera(a)redhat.com (Radek Pazdera)
"""
import os
+import re
import sys
import logging
+import copy
from lxml import etree
from lnst.Common.XmlTemplates import XmlTemplates, XmlTemplateError
from lnst.Common.XmlProcessing import XmlProcessingError, XmlData
class XmlParser(object):
+ XINCLUDE_RE = r"\{http\:\/\/www\.w3\.org\/[0-9]{4}\/XInclude\}include"
+
def __init__(self, schema_file, xml_path):
# locate the schema file
# try git path
@@ -39,9 +43,43 @@ class XmlParser(object):
self._schema = etree.RelaxNG(relaxng_doc)
def parse(self):
+ doc = self._parse(self._path)
+ self._remove_comments(doc)
+
+ # Due to a weird implementation of XInclude in lxml, the
+ # XmlParser resolves included documents on it's own.
+ #
+ # To be able to tell later on where each tag was located
+ # in the XML document, we add a '__file' attribute to
+ # each element of the tree during the parsing.
+ #
+ # However, these special attributes are of course not
+ # valid according to our schemas. To solve this, a copy of
+ # the tree is made and the '__file' attributes are removed
+ # before validation.
+ #
+ # XXX This is a *EXTREMELY* dirty hack. Ideas/proposals
+ # for cleaner solutions are more than welcome!
+ root_tag = self._init_loc(doc.getroot(), self._path)
+ self._expand_xinclude(root_tag, os.path.dirname(self._path))
+
+ self._template_proc.process_aliases(root_tag)
+
try:
- doc = etree.parse(self._path)
- doc.xinclude()
+ self._validate(doc)
+ except:
+ err = self._schema.error_log[0]
+ loc = {"file": os.path.basename(err.filename),
+ "line": err.line, "col": err.column}
+ exc = XmlProcessingError(err.message)
+ exc.set_loc(loc)
+ raise exc
+
+ return self._process(root_tag)
+
+ def _parse(self, path):
+ try:
+ doc = etree.parse(path)
except Exception as err:
# A workaround for cases when lxml (quite strangely)
# sets the filename to <string>.
@@ -56,20 +94,7 @@ class XmlParser(object):
exc.set_loc(loc)
raise exc
- root_tag = doc.getroot()
- self._template_proc.process_aliases(root_tag)
-
- try:
- self._schema.assertValid(doc)
- except:
- err = self._schema.error_log[0]
- loc = {"file": os.path.basename(err.filename),
- "line": err.line, "col": err.column}
- exc = XmlProcessingError(err.message)
- exc.set_loc(loc)
- raise exc
-
- return self._process(root_tag)
+ return doc
def _process(self, root_tag):
pass
@@ -86,3 +111,61 @@ class XmlParser(object):
def _get_content(self, element):
text = etree.tostring(element, method="text")
return self._template_proc.expand_functions(text)
+
+ def _expand_xinclude(self, elem, base_url=""):
+ for e in elem:
+ if re.match(self.XINCLUDE_RE, str(e.tag)):
+ href = os.path.join(base_url, e.get("href"))
+ filename = os.path.basename(href)
+
+ doc = self._parse(href)
+ self._remove_comments(doc)
+ node = doc.getroot()
+
+ node = self._init_loc(node, href)
+
+ if e.tail:
+ node.tail = (node.tail or "") + e.tail
+ self._expand_xinclude(node, os.path.dirname(href))
+
+ parent = e.getparent()
+ if parent is None:
+ return node
+
+ parent.replace(e, node)
+ return elem
+
+ def _remove_comments(self, doc):
+ comments = doc.xpath('//comment()')
+ for c in comments:
+ p = c.getparent()
+ if p is not None:
+ p.remove(c)
+
+ def _init_loc(self, elem, filename):
+ """ Remove all coment tags from the tree """
+
+ elem.attrib["__file"] = filename
+ for e in elem:
+ self._init_loc(e, os.path.basename(filename))
+
+ return elem
+
+ def _validate(self, original):
+ """
+ Make a copy of the tree, remove the '__file' attributes
+ and validate against the appropriate schema.
+
+ Very unfortunate solution.
+ """
+ doc = copy.deepcopy(original)
+ root = doc.getroot()
+
+ self._prepare_tree_for_validation(root)
+ self._schema.assertValid(doc)
+
+ def _prepare_tree_for_validation(self, elem):
+ if "__file" in elem.attrib:
+ del elem.attrib["__file"]
+ for e in elem:
+ self._prepare_tree_for_validation(e)
diff --git a/lnst/Common/XmlProcessing.py b/lnst/Common/XmlProcessing.py
index 5ea297d..771ead5 100644
--- a/lnst/Common/XmlProcessing.py
+++ b/lnst/Common/XmlProcessing.py
@@ -24,15 +24,21 @@ class XmlProcessingError(Exception):
super(XmlProcessingError, self).__init__()
self._msg = msg
- if obj is not None and hasattr(obj, "loc"):
- self.set_loc(obj.loc)
-
- loc = {}
- if obj is not None and hasattr(obj, "base") and obj.base != None:
- loc["file"] = os.path.basename(obj.base)
- if hasattr(obj, "sourceline"):
- loc["line"] = obj.sourceline
- self.set_loc(loc)
+ if obj is not None:
+ if hasattr(obj, "loc"):
+ self.set_loc(obj.loc)
+ elif hasattr(obj, "attrib") and "__file" in obj.attrib:
+ loc = {}
+ loc["file"] = obj.attrib["__file"]
+ if hasattr(obj, "sourceline"):
+ loc["line"] = obj.sourceline
+ self.set_loc(loc)
+ elif hasattr(obj, "base") and obj.base != None:
+ loc = {}
+ loc["file"] = os.path.basename(obj.base)
+ if hasattr(obj, "sourceline"):
+ loc["line"] = obj.sourceline
+ self.set_loc(loc)
def set_loc(self, loc):
@@ -89,6 +95,12 @@ class XmlCollection(list):
if node is not None:
if hasattr(node, "loc"):
self.loc = node.loc
+ elif "__file" in node.attrib:
+ loc = {}
+ loc["file"] = node.attrib["__file"]
+ if hasattr(node, "sourceline"):
+ loc["line"] = node.sourceline
+ self.loc = loc
elif hasattr(node, "base") and node.base != None:
loc = {}
loc["file"] = os.path.basename(node.base)
@@ -113,6 +125,12 @@ class XmlData(dict):
if node is not None:
if hasattr(node, "loc"):
self.loc = node.loc
+ elif "__file" in node.attrib:
+ loc = {}
+ loc["file"] = node.attrib["__file"]
+ if hasattr(node, "sourceline"):
+ loc["line"] = node.sourceline
+ self.loc = loc
elif hasattr(node, "base") and node.base != None:
loc = {}
loc["file"] = os.path.basename(node.base)
diff --git a/lnst/Controller/RecipeParser.py b/lnst/Controller/RecipeParser.py
index 2bd6c38..2720359 100644
--- a/lnst/Controller/RecipeParser.py
+++ b/lnst/Controller/RecipeParser.py
@@ -34,13 +34,6 @@ class RecipeParser(XmlParser):
def _process(self, lnst_recipe):
recipe = XmlData(lnst_recipe)
- # remove all comments from the document
- comments = lnst_recipe.xpath('//comment()')
- for c in comments:
- p = c.getparent()
- if p is not None:
- p.remove(c)
-
# machines
machines_tag = lnst_recipe.find("machines")
if machines_tag is not None: