pyRdfa.host.html5

Simple transformer for HTML5: add a @src for any @data, add a @content for the @value attribute of the <data> element, and interpret the <time> element.

@summary: Add a top "about" to <head> and <body> @requires: U{RDFLib package<http://rdflib.net>} @organization: U{World Wide Web Consortium<http://www.w3.org>} @author: U{Ivan Herman} @license: This software is available for use under the U{W3C® SOFTWARE NOTICE AND LICENSE} @contact: Ivan Herman, ivan@w3.org

  1# -*- coding: utf-8 -*-
  2"""
  3Simple transformer for HTML5: add a @src for any @data, add a @content for the @value attribute of the <data> element, and interpret the <time> element.
  4
  5@summary: Add a top "about" to <head> and <body>
  6@requires: U{RDFLib package<http://rdflib.net>}
  7@organization: U{World Wide Web Consortium<http://www.w3.org>}
  8@author: U{Ivan Herman<http://www.w3.org/People/Ivan/>}
  9@license: This software is available for use under the
 10U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}
 11@contact: Ivan Herman, ivan@w3.org
 12"""
 13
 14"""
 15$Id: html5.py,v 1.15 2014-11-04 13:18:48 ivan Exp $
 16$Date: 2014-11-04 13:18:48 $
 17"""
 18
 19from functools import reduce
 20
 21# The handling of datetime is a little bit more complex... better put this in a separate function for better management
 22from datetime import datetime
 23import re
 24
 25datetime_type =   "http://www.w3.org/2001/XMLSchema#dateTime"
 26time_type =       "http://www.w3.org/2001/XMLSchema#time"
 27date_type =       "http://www.w3.org/2001/XMLSchema#date"
 28date_gYear =      "http://www.w3.org/2001/XMLSchema#gYear"
 29date_gYearMonth = "http://www.w3.org/2001/XMLSchema#gYearMonth"
 30date_gMonthDay =  "http://www.w3.org/2001/XMLSchema#gMonthDay"
 31duration_type =   "http://www.w3.org/2001/XMLSchema#duration"
 32plain =           "plain"
 33
 34handled_time_types = [datetime_type, time_type, date_type, date_gYear, date_gYearMonth, date_gMonthDay, duration_type]
 35
 36_formats = {
 37    date_gMonthDay:  ["%m-%d"],
 38    date_gYearMonth: ["%Y-%m"],
 39    date_gYear:      ["%Y"],
 40    date_type:       ["%Y-%m-%d", "%Y-%m-%dZ"],
 41    time_type:       ["%H:%M",
 42                      "%H:%M:%S",
 43                      "%H:%M:%SZ",
 44                      "%H:%M:%S.%f"],
 45    datetime_type:   ["%Y-%m-%dT%H:%M",
 46                      "%Y-%m-%dT%H:%M:%S",
 47                      "%Y-%m-%dT%H:%M:%S.%f",
 48                      "%Y-%m-%dT%H:%MZ",
 49                      "%Y-%m-%dT%H:%M:%SZ",
 50                      "%Y-%m-%dT%H:%M:%S.%fZ"],
 51    duration_type:   ["P%dD",
 52                      "P%YY%mM%dD",
 53                      "P%YY%mM",
 54                      "P%YY%dD",
 55                      "P%YY",
 56                      "P%mM",
 57                      "P%mM%dD"]
 58}
 59
 60_dur_times = ["%HH%MM%SS", "%HH", "%MM", "%SS", "%HH%MM", "%HH%SS", "%MM%SS"]
 61
 62def _format_test(string):
 63    """
 64    Tests the string format to see whether it fits one of the time datatypes
 65    @param string: attribute value to test
 66    @return: a URI for the xsd datatype or the string 'plain'
 67    """
 68    # Try to get the easy cases:
 69    for key in _formats:
 70        for f in _formats[key]:
 71            try :
 72                # try to check if the syntax is fine
 73                _d = datetime.strptime(string, f)
 74                # bingo!
 75                return key
 76            except ValueError:
 77                pass
 78            
 79    # Now come the special cases:-(
 80    # Check first for the duration stuff, that is the nastiest.
 81    if len(string) > 2 and (string[0] == 'P' or (string [0] == '-' and string[1] == 'P')):
 82        # this is meant to be a duration type
 83        # first of all, get rid of the leading '-' and check again
 84        if string[0] == '-':
 85            for f in _formats[duration_type]:
 86                try :
 87                    # try to check if the syntax is fine
 88                    _d = datetime.strptime(string, f)
 89                    # bingo!
 90                    return duration_type
 91                except ValueError :
 92                    pass
 93        # Let us see if the value contains a separate time portion, and cut that one
 94        durs = string.split('T')
 95        if len(durs) == 2 :
 96            # yep, so we should check again
 97            dur = durs[0]
 98            tm  = durs[1]
 99            # Check the duration part
100            td = False
101            for f in _formats[duration_type] :
102                try :
103                    # try to check if the syntax is fine
104                    _d = datetime.strptime(dur, f)
105                    # bingo!
106                    td = True
107                    break
108                except ValueError :
109                    pass
110            if td == True :
111                # Getting there...
112                for f in _dur_times :
113                    try :
114                        # try to check if the syntax is fine
115                        _d = datetime.strptime(tm, f)
116                        # bingo!
117                        return duration_type
118                    except ValueError :
119                        pass
120            # something went wrong
121            return plain
122        else:
123            # Well, no more tricks, this is a plain type
124            return plain
125
126    # If we got here, we should check the time zone
127    # there is a discrepancy betwen the python and the HTML5/XSD lexical string,
128    # which means that this has to handled separately for the date and the timezone portion
129    try:
130        # The time-zone-less portion of the string
131        s = string[0:-6]
132        # The time-zone portion
133        tz = string[-5:]
134        try :
135            _t = datetime.strptime(tz,"%H:%M")
136        except ValueError :
137            # Bummer, this is not a correct time
138            return plain
139        # The time-zone is fine, the datetime portion has to be checked        
140        for f in _formats[datetime_type] :
141            try :
142                # try to check if it is fine
143                _d = datetime.strptime(s, f)
144                # Bingo!
145                return datetime_type
146            except ValueError :
147                pass
148    except :
149        pass
150    return plain
151
def html5_extra_attributes(node, state):
    """
    Give the element a @content attribute (and, when one can be detected, a
    @datatype attribute) derived from its time information. Nothing is done if
    the element already has a @content. Otherwise the value is taken from
    @datetime, else from @dateTime, else, for a <time> element, from the text
    content of the element.

    @param node: the current node that could be modified
    @param state: current state
    @type state: L{Execution context<pyRdfa.state.ExecutionContext>}
    """
    def _collect_text(parent):
        """
        Gather (recursively) the text found below a DOM Node.

        @param parent: DOM Node
        @return: string; white space runs are collapsed and the result is stripped
        unless the space_preserve option of the state is set
        """
        pieces = []
        for child in parent.childNodes:
            if child.nodeType == child.TEXT_NODE:
                pieces.append(child.data)
            elif child.nodeType == child.ELEMENT_NODE:
                # the text of a child element goes through the same white space handling
                pieces.append(_collect_text(child))
        text = "".join(pieces)
        if state.options.space_preserve:
            return text
        return re.sub(r'(\r| |\n|\t)+', " ", text).strip()

    def _store(value):
        """Set @content to the value; add a @datatype if none is present and the value is not plain."""
        if not node.hasAttribute("datatype"):
            detected = _format_test(value)
            if detected != plain:
                node.setAttribute("datatype", detected)
        node.setAttribute("content", value)

    # @content has top priority over the others...
    if node.hasAttribute("content"):
        return

    # ...then the attribute, in either spelling...
    for attr_name in ("datetime", "dateTime"):
        if node.hasAttribute(attr_name):
            _store(node.getAttribute(attr_name))
            return

    # ...and finally the text content, for <time> elements only
    if node.tagName == "time":
        _store(_collect_text(node))
197        
def remove_rel(node, _state):
    """
    If @property and @rel/@rev are on the same element, then only CURIE and URI can appear as a rel/rev value.
    Every value of @rel/@rev that matches C{termname} is dropped; the attribute itself is
    removed when no value is left.

    @param node: the current node that could be modified
    @param _state: current state (not used by this function)
    @type _state: L{Execution context<pyRdfa.state.ExecutionContext>}
    """
    from ..termorcurie import termname

    def _filter_attribute(element, attr_name):
        """Do the filtering for one attribute; called once for @rev and once for @rel."""
        if not (element.hasAttribute("property") and element.hasAttribute(attr_name)):
            return
        tokens = element.getAttribute(attr_name).strip().split()
        if not tokens:
            # an empty attribute value is left untouched
            return
        kept = [token for token in tokens if not termname.match(token)]
        if kept:
            element.setAttribute(attr_name, ' '.join(kept))
        else:
            element.removeAttribute(attr_name)

    for attr_name in ("rev", "rel"):
        _filter_attribute(node, attr_name)
# XML Schema datatype URIs used for time-like values
datetime_type = 'http://www.w3.org/2001/XMLSchema#dateTime'
time_type = 'http://www.w3.org/2001/XMLSchema#time'
date_type = 'http://www.w3.org/2001/XMLSchema#date'
date_gYear = 'http://www.w3.org/2001/XMLSchema#gYear'
date_gYearMonth = 'http://www.w3.org/2001/XMLSchema#gYearMonth'
date_gMonthDay = 'http://www.w3.org/2001/XMLSchema#gMonthDay'
duration_type = 'http://www.w3.org/2001/XMLSchema#duration'
# Marker for a value that fits none of the datatypes above
plain = 'plain'
# All the datatype URIs above, in one list
handled_time_types = [
    datetime_type,
    time_type,
    date_type,
    date_gYear,
    date_gYearMonth,
    date_gMonthDay,
    duration_type,
]
def html5_extra_attributes(node, state):
    """
    Give the element a @content attribute (and, when one can be detected, a
    @datatype attribute) derived from its time information. Nothing is done if
    the element already has a @content. Otherwise the value is taken from
    @datetime, else from @dateTime, else, for a <time> element, from the text
    content of the element.

    @param node: the current node that could be modified
    @param state: current state
    @type state: L{Execution context<pyRdfa.state.ExecutionContext>}
    """
    def _collect_text(parent):
        """
        Gather (recursively) the text found below a DOM Node.

        @param parent: DOM Node
        @return: string; white space runs are collapsed and the result is stripped
        unless the space_preserve option of the state is set
        """
        pieces = []
        for child in parent.childNodes:
            if child.nodeType == child.TEXT_NODE:
                pieces.append(child.data)
            elif child.nodeType == child.ELEMENT_NODE:
                # the text of a child element goes through the same white space handling
                pieces.append(_collect_text(child))
        text = "".join(pieces)
        if state.options.space_preserve:
            return text
        return re.sub(r'(\r| |\n|\t)+', " ", text).strip()

    def _store(value):
        """Set @content to the value; add a @datatype if none is present and the value is not plain."""
        if not node.hasAttribute("datatype"):
            detected = _format_test(value)
            if detected != plain:
                node.setAttribute("datatype", detected)
        node.setAttribute("content", value)

    # @content has top priority over the others...
    if node.hasAttribute("content"):
        return

    # ...then the attribute, in either spelling...
    for attr_name in ("datetime", "dateTime"):
        if node.hasAttribute(attr_name):
            _store(node.getAttribute(attr_name))
            return

    # ...and finally the text content, for <time> elements only
    if node.tagName == "time":
        _store(_collect_text(node))

@param node: the current node that could be modified @param state: current state @type state: L{Execution context<pyRdfa.state.ExecutionContext>}

def remove_rel(node, _state):
    """
    If @property and @rel/@rev are on the same element, then only CURIE and URI can appear as a rel/rev value.
    Every value of @rel/@rev that matches C{termname} is dropped; the attribute itself is
    removed when no value is left.

    @param node: the current node that could be modified
    @param _state: current state (not used by this function)
    @type _state: L{Execution context<pyRdfa.state.ExecutionContext>}
    """
    from ..termorcurie import termname

    def _filter_attribute(element, attr_name):
        """Do the filtering for one attribute; called once for @rev and once for @rel."""
        if not (element.hasAttribute("property") and element.hasAttribute(attr_name)):
            return
        tokens = element.getAttribute(attr_name).strip().split()
        if not tokens:
            # an empty attribute value is left untouched
            return
        kept = [token for token in tokens if not termname.match(token)]
        if kept:
            element.setAttribute(attr_name, ' '.join(kept))
        else:
            element.removeAttribute(attr_name)

    for attr_name in ("rev", "rel"):
        _filter_attribute(node, attr_name)

If @property and @rel/@rev are on the same element, then only CURIE and URI can appear as a rel/rev value.

@param node: the current node that could be modified @param state: current state @type state: L{Execution context<pyRdfa.state.ExecutionContext>}