pyRdfa.rdflibparsers

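Extraction parsers for structured data embedded into HTML or XML files. The former may include RDFa or microdata. The syntax and the extraction procedures are based on the RDFa specifications (http://www.w3.org/TR/#tr_RDFa), the microdata specification (http://www.w3.org/TR/microdata/) and the specification of the microdata to RDF conversion (http://www.w3.org/TR/microdata-rdf/).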

License: W3C Software License, http://www.w3.org/Consortium/Legal/copyright-software Author: Ivan Herman Copyright: W3C

  1#!/usr/bin/env python
  2"""
  3Extraction parsers for structured data embedded into HTML or XML files.
  4The former may include RDFa or microdata. The syntax and the extraction
  5procedures are based on:
  6
  7* The RDFa specifications: http://www.w3.org/TR/#tr_RDFa
  8* The microdata specification: http://www.w3.org/TR/microdata/
  9* The specification of the microdata to RDF conversion:
 10http://www.w3.org/TR/microdata-rdf/
 11
 12License: W3C Software License,
 13http://www.w3.org/Consortium/Legal/copyright-software
 14Author: Ivan Herman
 15Copyright: W3C
 16
 17"""
 18
 19from rdflib.parser import (
 20    Parser, StringInputSource, URLInputSource, FileInputSource)
 21
 22from . import pyRdfa, Options
 23from .embeddedRDF import handle_embeddedRDF
 24from .state import ExecutionContext
 25
 26try:
 27    import html5lib
 28    assert html5lib
 29    html5lib = True
 30except ImportError:
 31    import warnings
 32    warnings.warn(
 33        'html5lib not found! RDFa and Microdata ' +
 34        'parsers will not be available.')
 35    html5lib = False
 36
 37
 38def _get_orig_source(source):
 39    """
 40    A bit of a hack; the RDFa/microdata parsers need more than what the
 41    upper layers of RDFLib provide...
 42    This method returns the original source references.
 43    """
 44    if isinstance(source, StringInputSource):
 45        orig_source = source.getByteStream()
 46    elif isinstance(source, URLInputSource):
 47        orig_source = source.url
 48    elif isinstance(source, FileInputSource):
 49        orig_source = source.file.name
 50        source.file.close()
 51    else:
 52        orig_source = source.getByteStream()
 53    baseURI = source.getPublicId()
 54    return (baseURI, orig_source)
 55
 56
 57def _check_error(graph):
 58    from . import RDFA_Error, ns_rdf
 59    from .options import ns_dc
 60    for s, _p, _o in graph.triples((None, ns_rdf["type"], RDFA_Error)):
 61        for _x, _y, msg in graph.triples((s, ns_dc["description"], None)):
 62            raise Exception("RDFa parsing Error! %s" % msg)
 63
 64
 65# This is the parser interface as it would look when called from the
 66# rest of RDFLib
 67class RDFaParser(Parser):
 68    """
 69    Wrapper around the RDFa 1.1 parser. For further details on the RDFa 1.1
 70    processing, see the relevant W3C documents at
 71    http://www.w3.org/TR/#tr_RDFa. RDFa 1.1 is defined for XHTML, HTML5, SVG
 72    and, in general, for any XML language.
 73
 74    Note that the parser can also handle RDFa 1.0 if the extra parameter is
 75    used and/or the input source uses RDFa 1.0 specific @version or DTD-s.
 76    """
 77    def parse(self, source, graph,
 78              pgraph=None,
 79              media_type="",
 80              rdfa_version=None,
 81              embedded_rdf=False,
 82              space_preserve=True,
 83              vocab_expansion=False,
 84              vocab_cache=False,
 85              refresh_vocab_cache=False,
 86              vocab_cache_report=False,
 87              check_lite=False):
 88        """
 89        @param source: one of the input sources that the RDFLib package defined
 90        @type source: InputSource class instance
 91        @param graph: target graph for the triples; output graph, in RDFa spec.
 92        parlance
 93        @type graph: RDFLib Graph
 94        @keyword pgraph: target for error and warning triples; processor graph,
 95        in RDFa spec. parlance. If set to None, these triples are ignored
 96        @type pgraph: RDFLib Graph
 97        @keyword media_type: explicit setting of the preferred media type
 98        (a.k.a. content type) of the the RDFa source. None means the content
 99        type of the HTTP result is used, or a guess is made based on the
100        suffix of a file
101        @type media_type: string
102        @keyword rdfa_version: 1.0 or 1.1. If the value is "", then, by
103        default, 1.1 is used unless the source has explicit signals to use
104        1.0 (e.g., using a @version attribute, using a DTD set up for 1.0, etc)
105        @type rdfa_version: string
106        @keyword embedded_rdf: some formats allow embedding RDF in other
107        formats: (X)HTML can contain turtle in a special <script> element,
108        SVG can have RDF/XML embedded in a <metadata> element. This flag
109        controls whether those triples should be interpreted and added to
110        the output graph. Some languages (e.g., SVG) require this, and the
111        flag is ignored.
112        @type embedded_rdf: Boolean
113        @keyword space_preserve: by default, space in the HTML source must be preserved in the generated literal;
114        this behavior can be switched off
115        @type space_preserve: Boolean
116        @keyword vocab_expansion: whether the RDFa @vocab attribute should
117        also mean vocabulary expansion (see the RDFa 1.1 spec for further
118        details)
119        @type vocab_expansion: Boolean
120        @keyword vocab_cache: in case vocab expansion is used, whether the
121        expansion data (i.e., vocabulary) should be cached locally. This
122        requires the ability for the local application to write on the
123        local file system
124        @type vocab_chache: Boolean
125        @keyword vocab_cache_report: whether the details of vocabulary file caching process should be reported
126        in the processor graph as information (mainly useful for debug)
127        @type vocab_cache_report: Boolean
128        @keyword refresh_vocab_cache: whether the caching checks of vocabs should be by-passed, ie, if caches should be re-generated regardless of the stored date (important for vocab development)
129        @type refresh_vocab_cache: Boolean
130        @keyword check_lite: generate extra warnings in case the input source is not RDFa 1.1 check_lite
131        @type check_lite: Boolean
132        """
133
134        if html5lib is False:
135            raise ImportError(
136                'html5lib is not installed, cannot use ' +
137                'RDFa and Microdata parsers.')
138
139        (baseURI, orig_source) = _get_orig_source(source)
140        self._process(graph, pgraph, baseURI, orig_source,
141                      media_type=media_type,
142                      rdfa_version=rdfa_version,
143                      embedded_rdf=embedded_rdf,
144                      space_preserve=space_preserve,
145                      vocab_expansion=vocab_expansion,
146                      vocab_cache=vocab_cache,
147                      vocab_cache_report=vocab_cache_report,
148                      refresh_vocab_cache=refresh_vocab_cache,
149                      check_lite=check_lite
150                      )
151
152    def _process(self, graph, pgraph, baseURI, orig_source,
153                 media_type="",
154                 rdfa_version=None,
155                 embedded_rdf=False,
156                 space_preserve=True,
157                 vocab_expansion=False,
158                 vocab_cache=False,
159                 vocab_cache_report=False,
160                 refresh_vocab_cache=False,
161                 check_lite=False):
162
163        from rdflib import Graph
164        processor_graph = pgraph if pgraph is not None else Graph()
165        self.options = Options(output_processor_graph=True,
166                               embedded_rdf=embedded_rdf,
167                               space_preserve=space_preserve,
168                               vocab_expansion=vocab_expansion,
169                               vocab_cache=vocab_cache,
170                               vocab_cache_report=vocab_cache_report,
171                               refresh_vocab_cache=refresh_vocab_cache,
172                               check_lite=check_lite)
173
174        if media_type is None:
175            media_type = ""
176        processor = pyRdfa(self.options,
177                           base=baseURI,
178                           media_type=media_type,
179                           rdfa_version=rdfa_version)
180        processor.graph_from_source(orig_source, graph=graph, pgraph=processor_graph, rdfOutput=False)
181        # This may result in an exception if the graph parsing led to an error
182        _check_error(processor_graph)
183
184
class RDFa10Parser(Parser):
    """
    Convenience class that pins the RDFa parser to RDFa 1.0.
    """

    def parse(self, source, graph, pgraph=None, media_type=""):
        """
        Parse C{source} as RDFa 1.0 by delegating to C{RDFaParser.parse}.

        @param source: one of the input sources defined by the RDFLib package
        @type source: InputSource class instance
        @param graph: target graph for the triples ("output graph" in RDFa
        spec. parlance)
        @type graph: RDFLib Graph
        @keyword pgraph: target for error and warning triples ("processor
        graph" in RDFa spec. parlance); passed on unchanged to the RDFa
        parser
        @type pgraph: RDFLib Graph
        @keyword media_type: explicit setting of the preferred media type
        (a.k.a. content type) of the RDFa source. If empty or None, the
        content type of the HTTP result is used, or a guess is made based
        on the suffix of a file
        @type media_type: string
        """
        delegate = RDFaParser()
        delegate.parse(
            source, graph,
            pgraph=pgraph,
            media_type=media_type,
            rdfa_version="1.0")
211
212
213
214
class StructuredDataParser(Parser):
    """
    Convenience parser to extract both RDFa (including embedded Turtle)
    and microdata from an HTML file.
    It is simply a wrapper around the specific parsers.
    """
    def parse(self, source, graph,
              pgraph=None,
              rdfa_version="",
              vocab_expansion=False,
              vocab_cache=False,
              media_type='text/html'
              ):
        """
        Run the RDFa, microdata and hturtle extractions, in that order, on
        the same source; all triples end up in C{graph}.

        @param source: one of the input sources that the RDFLib package defined
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa
        spec. parlance
        @type graph: RDFLib Graph
        @keyword pgraph: target for error and warning triples of the RDFa
        extraction; processor graph, in RDFa spec. parlance. It is not
        passed to the microdata and hturtle extractions
        @type pgraph: RDFLib Graph
        @keyword rdfa_version: 1.0 or 1.1. The empty string (the default)
        is turned into "1.1" before the RDFa parser is called
        @type rdfa_version: string
        @keyword vocab_expansion: whether the RDFa @vocab attribute should
        also mean vocabulary expansion (see the RDFa 1.1 spec for further
        details)
        @type vocab_expansion: Boolean
        @keyword vocab_cache: in case vocab expansion is used, whether the
        expansion data (i.e., vocabulary) should be cached locally. This
        requires the ability for the local application to write on the
        local file system
        @type vocab_cache: Boolean
        @keyword media_type: ignored; accepted only to avoid an 'unexpected
        argument' error. The source is always processed as text/html
        @type media_type: string
        """
        # At module level `warnings` is only imported when html5lib is
        # missing, so it is imported here to keep the pyMicrodata fallback
        # below from failing with a NameError.
        import warnings

        # Note that the media_type argument is ignored, and is here only to avoid an 'unexpected argument' error.
        # This parser works for text/html only anyway...
        (baseURI, orig_source) = _get_orig_source(source)
        # NOTE(review): the same orig_source is handed to three processors
        # in a row; for stream-like sources confirm that each of them can
        # still read it after the previous one is done.
        if rdfa_version == "":
            rdfa_version = "1.1"
        RDFaParser()._process(graph, pgraph, baseURI, orig_source,
                              media_type='text/html',
                              rdfa_version=rdfa_version,
                              vocab_expansion=vocab_expansion,
                              vocab_cache=vocab_cache)

        # pyMicrodata is an optional dependency: without it only the RDFa
        # and hturtle content is extracted, and a warning is issued.
        try:
            from pyMicrodata.rdflibparsers import MicrodataParser
            MicrodataParser()._process(graph, baseURI, orig_source)
        except ImportError:
            warnings.warn('pyMicrodata not installed, will only parse RDFa')

        HTurtleParser()._process(graph, baseURI, orig_source, media_type='text/html')
272
273
274
class HTurtle(pyRdfa):
    """
    The RDFa 1.1 processor bent into doing hturtle extraction only: the
    DOM traversal is replaced so that nothing but embedded RDF is handled.
    """

    def __init__(self, options=None, base="", media_type=""):
        """
        Set up the underlying processor; the RDFa version is fixed to 1.1.

        @keyword options: processing options, passed on to C{pyRdfa}
        @keyword base: base URI, passed on to C{pyRdfa}
        @keyword media_type: media type, passed on to C{pyRdfa}
        """
        pyRdfa.__init__(
            self,
            options=options,
            base=base,
            media_type=media_type,
            rdfa_version="1.1")

    def graph_from_DOM(self, dom, graph, pgraph=None):
        """
        Replacement for the parsing function of the original class that
        performs the Turtle extraction only.

        @param dom: DOM tree whose document element is the starting point
        @param graph: graph receiving the extracted content
        @keyword pgraph: if not None, the triples and namespace bindings of
        the processor graph kept in the options are copied into it
        """

        def _merge_into(target, origin):
            # Copy both the triples and the prefix bindings.
            for triple in origin:
                target.add(triple)
            for prefix, namespace in origin.namespaces():
                target.bind(prefix, namespace)

        def _walk(element):
            if handle_embeddedRDF(element, graph, context):
                # RDF content was found and extracted into the graph;
                # there is no need to look below this element.
                return
            # Otherwise descend into the element children only.
            for child in element.childNodes:
                if child.nodeType != element.ELEMENT_NODE:
                    continue
                _walk(child)

        root = dom.documentElement
        context = ExecutionContext(
            root, graph,
            base=self.base,
            options=self.options,
            rdfa_version="1.1")
        _walk(root)

        if pgraph is None:
            return
        _merge_into(pgraph, self.options.processor_graph.graph)
312
313# This is the parser interface as it would look when called from the rest of
314# RDFLib
315
316
class HTurtleParser(Parser):
    """
    RDFLib parser interface to the hturtle extraction, i.e., the extraction
    of the RDF content embedded in a source, done by the C{HTurtle}
    processor.
    """
    def parse(self, source, graph, pgraph=None, media_type=""):
        """
        @param source: one of the input sources that the RDFLib package defined
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa spec.
        parlance
        @type graph: RDFLib Graph
        @keyword pgraph: accepted for signature compatibility with the other
        parsers of this module; it is not used by the extraction
        @type pgraph: RDFLib Graph
        @keyword media_type: explicit setting of the preferred media type
        (a.k.a. content type) of the RDFa source. None means the content
        type of the HTTP result is used, or a guess is made based on the
        suffix of a file
        @type media_type: string
        @raise ImportError: if html5lib is not installed
        """
        if html5lib is False:
            raise ImportError(
                'html5lib is not installed, cannot ' +
                'use RDFa and Microdata parsers.')

        (baseURI, orig_source) = _get_orig_source(source)
        # _process() has no processor graph parameter. Passing pgraph
        # positionally shifted every argument by one and made each call
        # fail with a TypeError (multiple values for 'media_type').
        self._process(graph, baseURI, orig_source, media_type=media_type)

    def _process(self, graph, baseURI, orig_source, media_type=""):
        """
        Run the C{HTurtle} processor on an already resolved source.

        @param graph: output graph receiving the extracted triples
        @param baseURI: base URI handed to the processor
        @param orig_source: the original source reference, as returned by
        C{_get_orig_source}
        @keyword media_type: media type of the source; None is treated as
        the empty string
        """
        self.options = Options(output_processor_graph=None,
                               embedded_rdf=True,
                               vocab_expansion=False,
                               vocab_cache=False)

        if media_type is None:
            media_type = ""
        processor = HTurtle(
            self.options, base=baseURI, media_type=media_type)
        processor.graph_from_source(
            orig_source, graph=graph, pgraph=None, rdfOutput=False)
        # get possible error triples to raise exceptions
        _check_error(graph)
class RDFaParser(rdflib.parser.Parser):
 68class RDFaParser(Parser):
 69    """
 70    Wrapper around the RDFa 1.1 parser. For further details on the RDFa 1.1
 71    processing, see the relevant W3C documents at
 72    http://www.w3.org/TR/#tr_RDFa. RDFa 1.1 is defined for XHTML, HTML5, SVG
 73    and, in general, for any XML language.
 74
 75    Note that the parser can also handle RDFa 1.0 if the extra parameter is
 76    used and/or the input source uses RDFa 1.0 specific @version or DTD-s.
 77    """
 78    def parse(self, source, graph,
 79              pgraph=None,
 80              media_type="",
 81              rdfa_version=None,
 82              embedded_rdf=False,
 83              space_preserve=True,
 84              vocab_expansion=False,
 85              vocab_cache=False,
 86              refresh_vocab_cache=False,
 87              vocab_cache_report=False,
 88              check_lite=False):
 89        """
 90        @param source: one of the input sources that the RDFLib package defined
 91        @type source: InputSource class instance
 92        @param graph: target graph for the triples; output graph, in RDFa spec.
 93        parlance
 94        @type graph: RDFLib Graph
 95        @keyword pgraph: target for error and warning triples; processor graph,
 96        in RDFa spec. parlance. If set to None, these triples are ignored
 97        @type pgraph: RDFLib Graph
 98        @keyword media_type: explicit setting of the preferred media type
 99        (a.k.a. content type) of the the RDFa source. None means the content
100        type of the HTTP result is used, or a guess is made based on the
101        suffix of a file
102        @type media_type: string
103        @keyword rdfa_version: 1.0 or 1.1. If the value is "", then, by
104        default, 1.1 is used unless the source has explicit signals to use
105        1.0 (e.g., using a @version attribute, using a DTD set up for 1.0, etc)
106        @type rdfa_version: string
107        @keyword embedded_rdf: some formats allow embedding RDF in other
108        formats: (X)HTML can contain turtle in a special <script> element,
109        SVG can have RDF/XML embedded in a <metadata> element. This flag
110        controls whether those triples should be interpreted and added to
111        the output graph. Some languages (e.g., SVG) require this, and the
112        flag is ignored.
113        @type embedded_rdf: Boolean
114        @keyword space_preserve: by default, space in the HTML source must be preserved in the generated literal;
115        this behavior can be switched off
116        @type space_preserve: Boolean
117        @keyword vocab_expansion: whether the RDFa @vocab attribute should
118        also mean vocabulary expansion (see the RDFa 1.1 spec for further
119        details)
120        @type vocab_expansion: Boolean
121        @keyword vocab_cache: in case vocab expansion is used, whether the
122        expansion data (i.e., vocabulary) should be cached locally. This
123        requires the ability for the local application to write on the
124        local file system
125        @type vocab_chache: Boolean
126        @keyword vocab_cache_report: whether the details of vocabulary file caching process should be reported
127        in the processor graph as information (mainly useful for debug)
128        @type vocab_cache_report: Boolean
129        @keyword refresh_vocab_cache: whether the caching checks of vocabs should be by-passed, ie, if caches should be re-generated regardless of the stored date (important for vocab development)
130        @type refresh_vocab_cache: Boolean
131        @keyword check_lite: generate extra warnings in case the input source is not RDFa 1.1 check_lite
132        @type check_lite: Boolean
133        """
134
135        if html5lib is False:
136            raise ImportError(
137                'html5lib is not installed, cannot use ' +
138                'RDFa and Microdata parsers.')
139
140        (baseURI, orig_source) = _get_orig_source(source)
141        self._process(graph, pgraph, baseURI, orig_source,
142                      media_type=media_type,
143                      rdfa_version=rdfa_version,
144                      embedded_rdf=embedded_rdf,
145                      space_preserve=space_preserve,
146                      vocab_expansion=vocab_expansion,
147                      vocab_cache=vocab_cache,
148                      vocab_cache_report=vocab_cache_report,
149                      refresh_vocab_cache=refresh_vocab_cache,
150                      check_lite=check_lite
151                      )
152
153    def _process(self, graph, pgraph, baseURI, orig_source,
154                 media_type="",
155                 rdfa_version=None,
156                 embedded_rdf=False,
157                 space_preserve=True,
158                 vocab_expansion=False,
159                 vocab_cache=False,
160                 vocab_cache_report=False,
161                 refresh_vocab_cache=False,
162                 check_lite=False):
163
164        from rdflib import Graph
165        processor_graph = pgraph if pgraph is not None else Graph()
166        self.options = Options(output_processor_graph=True,
167                               embedded_rdf=embedded_rdf,
168                               space_preserve=space_preserve,
169                               vocab_expansion=vocab_expansion,
170                               vocab_cache=vocab_cache,
171                               vocab_cache_report=vocab_cache_report,
172                               refresh_vocab_cache=refresh_vocab_cache,
173                               check_lite=check_lite)
174
175        if media_type is None:
176            media_type = ""
177        processor = pyRdfa(self.options,
178                           base=baseURI,
179                           media_type=media_type,
180                           rdfa_version=rdfa_version)
181        processor.graph_from_source(orig_source, graph=graph, pgraph=processor_graph, rdfOutput=False)
182        # This may result in an exception if the graph parsing led to an error
183        _check_error(processor_graph)

Wrapper around the RDFa 1.1 parser. For further details on the RDFa 1.1 processing, see the relevant W3C documents at http://www.w3.org/TR/#tr_RDFa. RDFa 1.1 is defined for XHTML, HTML5, SVG and, in general, for any XML language.

Note that the parser can also handle RDFa 1.0 if the extra parameter is used and/or the input source uses RDFa 1.0 specific @version or DTD-s.

def parse( self, source, graph, pgraph=None, media_type='', rdfa_version=None, embedded_rdf=False, space_preserve=True, vocab_expansion=False, vocab_cache=False, refresh_vocab_cache=False, vocab_cache_report=False, check_lite=False):
 78    def parse(self, source, graph,
 79              pgraph=None,
 80              media_type="",
 81              rdfa_version=None,
 82              embedded_rdf=False,
 83              space_preserve=True,
 84              vocab_expansion=False,
 85              vocab_cache=False,
 86              refresh_vocab_cache=False,
 87              vocab_cache_report=False,
 88              check_lite=False):
 89        """
 90        @param source: one of the input sources that the RDFLib package defined
 91        @type source: InputSource class instance
 92        @param graph: target graph for the triples; output graph, in RDFa spec.
 93        parlance
 94        @type graph: RDFLib Graph
 95        @keyword pgraph: target for error and warning triples; processor graph,
 96        in RDFa spec. parlance. If set to None, these triples are ignored
 97        @type pgraph: RDFLib Graph
 98        @keyword media_type: explicit setting of the preferred media type
 99        (a.k.a. content type) of the the RDFa source. None means the content
100        type of the HTTP result is used, or a guess is made based on the
101        suffix of a file
102        @type media_type: string
103        @keyword rdfa_version: 1.0 or 1.1. If the value is "", then, by
104        default, 1.1 is used unless the source has explicit signals to use
105        1.0 (e.g., using a @version attribute, using a DTD set up for 1.0, etc)
106        @type rdfa_version: string
107        @keyword embedded_rdf: some formats allow embedding RDF in other
108        formats: (X)HTML can contain turtle in a special <script> element,
109        SVG can have RDF/XML embedded in a <metadata> element. This flag
110        controls whether those triples should be interpreted and added to
111        the output graph. Some languages (e.g., SVG) require this, and the
112        flag is ignored.
113        @type embedded_rdf: Boolean
114        @keyword space_preserve: by default, space in the HTML source must be preserved in the generated literal;
115        this behavior can be switched off
116        @type space_preserve: Boolean
117        @keyword vocab_expansion: whether the RDFa @vocab attribute should
118        also mean vocabulary expansion (see the RDFa 1.1 spec for further
119        details)
120        @type vocab_expansion: Boolean
121        @keyword vocab_cache: in case vocab expansion is used, whether the
122        expansion data (i.e., vocabulary) should be cached locally. This
123        requires the ability for the local application to write on the
124        local file system
125        @type vocab_chache: Boolean
126        @keyword vocab_cache_report: whether the details of vocabulary file caching process should be reported
127        in the processor graph as information (mainly useful for debug)
128        @type vocab_cache_report: Boolean
129        @keyword refresh_vocab_cache: whether the caching checks of vocabs should be by-passed, ie, if caches should be re-generated regardless of the stored date (important for vocab development)
130        @type refresh_vocab_cache: Boolean
131        @keyword check_lite: generate extra warnings in case the input source is not RDFa 1.1 check_lite
132        @type check_lite: Boolean
133        """
134
135        if html5lib is False:
136            raise ImportError(
137                'html5lib is not installed, cannot use ' +
138                'RDFa and Microdata parsers.')
139
140        (baseURI, orig_source) = _get_orig_source(source)
141        self._process(graph, pgraph, baseURI, orig_source,
142                      media_type=media_type,
143                      rdfa_version=rdfa_version,
144                      embedded_rdf=embedded_rdf,
145                      space_preserve=space_preserve,
146                      vocab_expansion=vocab_expansion,
147                      vocab_cache=vocab_cache,
148                      vocab_cache_report=vocab_cache_report,
149                      refresh_vocab_cache=refresh_vocab_cache,
150                      check_lite=check_lite
151                      )

@param source: one of the input sources that the RDFLib package defined @type source: InputSource class instance @param graph: target graph for the triples; output graph, in RDFa spec. parlance @type graph: RDFLib Graph @keyword pgraph: target for error and warning triples; processor graph, in RDFa spec. parlance. If set to None, these triples are ignored @type pgraph: RDFLib Graph @keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the the RDFa source. None means the content type of the HTTP result is used, or a guess is made based on the suffix of a file @type media_type: string @keyword rdfa_version: 1.0 or 1.1. If the value is "", then, by default, 1.1 is used unless the source has explicit signals to use 1.0 (e.g., using a @version attribute, using a DTD set up for 1.0, etc) @type rdfa_version: string @keyword embedded_rdf: some formats allow embedding RDF in other formats: (X)HTML can contain turtle in a special