pyRdfa.host.html5
Simple transformer for HTML5: add a @src for any @data, add a @content for the @value attribute of the <data> element, and interpret the <time> element.
@summary: Add a top "about" to <head>
# -*- coding: utf-8 -*-
"""
Simple transformer for HTML5: add a @src for any @data, add a @content for the @value attribute of the <data> element,
and interpret the <time> element.

@summary: Add a top "about" to <head> and <body>
@requires: U{RDFLib package<http://rdflib.net>}
@organization: U{World Wide Web Consortium<http://www.w3.org>}
@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
@license: This software is available for use under the
U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
@contact: Ivan Herman, ivan@w3.org
"""

"""
$Id: html5.py,v 1.15 2014-11-04 13:18:48 ivan Exp $
$Date: 2014-11-04 13:18:48 $
"""

# NOTE: 'reduce' is no longer used in this module; the import is retained in case
# other code imports the name from here.
from functools import reduce

# The handling of datetime is a little bit more complex... better put this in a
# separate function for a better management
from datetime import datetime
import re

datetime_type = "http://www.w3.org/2001/XMLSchema#dateTime"
time_type = "http://www.w3.org/2001/XMLSchema#time"
date_type = "http://www.w3.org/2001/XMLSchema#date"
date_gYear = "http://www.w3.org/2001/XMLSchema#gYear"
date_gYearMonth = "http://www.w3.org/2001/XMLSchema#gYearMonth"
date_gMonthDay = "http://www.w3.org/2001/XMLSchema#gMonthDay"
duration_type = "http://www.w3.org/2001/XMLSchema#duration"
plain = "plain"

handled_time_types = [datetime_type, time_type, date_type, date_gYear, date_gYearMonth, date_gMonthDay, duration_type]

# strptime patterns accepted for each datatype. The dictionary is scanned in
# this order by _format_test and the first datatype with a matching pattern wins.
_formats = {
    date_gMonthDay: ["%m-%d"],
    date_gYearMonth: ["%Y-%m"],
    date_gYear: ["%Y"],
    date_type: ["%Y-%m-%d", "%Y-%m-%dZ"],
    time_type: ["%H:%M",
                "%H:%M:%S",
                "%H:%M:%SZ",
                "%H:%M:%S.%f"],
    datetime_type: ["%Y-%m-%dT%H:%M",
                    "%Y-%m-%dT%H:%M:%S",
                    "%Y-%m-%dT%H:%M:%S.%f",
                    "%Y-%m-%dT%H:%MZ",
                    "%Y-%m-%dT%H:%M:%SZ",
                    "%Y-%m-%dT%H:%M:%S.%fZ"],
    duration_type: ["P%dD",
                    "P%YY%mM%dD",
                    "P%YY%mM",
                    "P%YY%dD",
                    "P%YY",
                    "P%mM",
                    "P%mM%dD"]
}

# Patterns for the portion of a duration that follows the 'T' separator
_dur_times = ["%HH%MM%SS", "%HH", "%MM", "%SS", "%HH%MM", "%HH%SS", "%MM%SS"]


def _matches_any(value, formats):
    """
    Check whether a string can be parsed with at least one strptime pattern.

    @param value: the string to test
    @param formats: iterable of strptime format strings
    @return: True if one of the formats parses the complete string, False otherwise
    """
    for fmt in formats:
        try:
            datetime.strptime(value, fmt)
        except ValueError:
            # wrong syntax for this pattern, try the next one
            continue
        return True
    return False


def _format_test(string):
    """
    Tests the string format to see whether it fits one of the time datatypes.

    The syntax checks are delegated to C{datetime.strptime}, i.e., only field values
    that strptime accepts for the corresponding directive are recognised.

    @param string: attribute value to test
    @return: a URI for the xsd datatype or the string 'plain'
    """
    # Try to get the easy cases:
    for key in _formats:
        if _matches_any(string, _formats[key]):
            return key

    # Now come the special cases:-(
    # Check first for the duration stuff, that is the nastiest.
    if len(string) > 2 and (string[0] == 'P' or (string[0] == '-' and string[1] == 'P')):
        # This is meant to be a duration type.
        # First of all, really get rid of the leading '-' and check again
        # (the unsigned patterns are the only ones available)
        unsigned = string[1:] if string[0] == '-' else string
        if _matches_any(unsigned, _formats[duration_type]):
            return duration_type
        # Let us see if the value contains a separate time portion, and cut that one
        durs = unsigned.split('T')
        if len(durs) == 2:
            dur, tm = durs
            # A bare 'P' means that the duration has a time portion only (e.g. 'PT4H18M3S')
            date_part_ok = dur == 'P' or _matches_any(dur, _formats[duration_type])
            if date_part_ok and _matches_any(tm, _dur_times):
                return duration_type
        # Well, no more tricks, this is a plain type
        return plain

    # If we got here, we should check the time zone.
    # There is a discrepancy between the python and the HTML5/XSD lexical string,
    # which means that this has to be handled separately for the date and the timezone portion:
    # the last five characters must be 'hh:mm', preceded by an explicit sign.
    if len(string) > 6 and string[-6] in ('+', '-'):
        # The time-zone-less portion of the string
        s = string[:-6]
        # The time-zone portion
        tz = string[-5:]
        if _matches_any(tz, ["%H:%M"]) and _matches_any(s, _formats[datetime_type]):
            return datetime_type
    return plain


def html5_extra_attributes(node, state):
    """
    Generate a @content (and, when it can be deduced, a @datatype) attribute from a
    @datetime/@dateTime attribute or from the text content of a <time> element.
    Nothing happens if the node already has a @content attribute.

    @param node: the current node that could be modified
    @param state: current state
    @type state: L{Execution context<pyRdfa.state.ExecutionContext>}
    """
    def _collect_text(parent):
        """Concatenate (recursively) the raw text of all descendants of a DOM Node."""
        parts = []
        for child in parent.childNodes:
            if child.nodeType == child.TEXT_NODE:
                parts.append(child.data)
            elif child.nodeType == child.ELEMENT_NODE:
                parts.append(_collect_text(child))
        return "".join(parts)

    def _get_literal(Pnode):
        """
        Get (recursively) the full text from a DOM Node.

        @param Pnode: DOM Node
        @return: string
        """
        rc = _collect_text(Pnode)
        if state.options.space_preserve:
            return rc
        # Normalize only once, on the complete text: doing it per element would
        # drop the white space that separates the text of nested elements
        return re.sub(r'(\r| |\n|\t)+', " ", rc).strip()
    # end _get_literal

    def _set_time(value):
        if not node.hasAttribute("datatype"):
            # Check the datatype:
            dt = _format_test(value)
            if dt != plain:
                node.setAttribute("datatype", dt)
        # Finally, set the value itself
        node.setAttribute("content", value)
    # end _set_time

    if not node.hasAttribute("content"):
        # @content has top priority over the others...
        if node.hasAttribute("datetime"):
            _set_time(node.getAttribute("datetime"))
        elif node.hasAttribute("dateTime"):
            _set_time(node.getAttribute("dateTime"))
        elif node.tagName == "time":
            # Note that a possible @datetime/@dateTime value has already been taken care of
            _set_time(_get_literal(node))


def remove_rel(node, _state):
    """
    If @property and @rel/@rev are on the same element, then only CURIE and URI can appear as a rel/rev value.

    @param node: the current node that could be modified
    @param _state: current state (not used by this function)
    @type _state: L{Execution context<pyRdfa.state.ExecutionContext>}
    """
    def _massage_node(node, attr):
        """The real work for remove_rel is done here, parametrized with @rel and @rev"""
        if not (node.hasAttribute("property") and node.hasAttribute(attr)):
            return
        # Resolved only when there is something to filter
        from ..termorcurie import termname
        vals = node.getAttribute(attr).split()
        if not vals:
            # empty attribute value: left untouched
            return
        # Keep only the values that do not match the 'termname' pattern
        final_vals = [v for v in vals if not termname.match(v)]
        if final_vals:
            node.setAttribute(attr, ' '.join(final_vals))
        else:
            node.removeAttribute(attr)

    _massage_node(node, "rev")
    _massage_node(node, "rel")
# URIs of the XML Schema datatypes that can be assigned to time related values
datetime_type = 'http://www.w3.org/2001/XMLSchema#dateTime'
time_type = 'http://www.w3.org/2001/XMLSchema#time'
date_type = 'http://www.w3.org/2001/XMLSchema#date'
date_gYear = 'http://www.w3.org/2001/XMLSchema#gYear'
date_gYearMonth = 'http://www.w3.org/2001/XMLSchema#gYearMonth'
date_gMonthDay = 'http://www.w3.org/2001/XMLSchema#gMonthDay'
duration_type = 'http://www.w3.org/2001/XMLSchema#duration'
# Marker used for values that do not fit any of the datatypes above
plain = 'plain'
# All the datatype URIs listed above, in one list
handled_time_types = [
    datetime_type,
    time_type,
    date_type,
    date_gYear,
    date_gYearMonth,
    date_gMonthDay,
    duration_type,
]
def html5_extra_attributes(node, state):
    """
    Generate a @content (and, when it can be deduced, a @datatype) attribute from a
    @datetime/@dateTime attribute or from the text content of a <time> element.
    Nothing happens if the node already has a @content attribute.

    @param node: the current node that could be modified
    @param state: current state
    @type state: L{Execution context<pyRdfa.state.ExecutionContext>}
    """
    def _collect_text(parent):
        """Concatenate (recursively) the raw text of all descendants of a DOM Node."""
        parts = []
        for child in parent.childNodes:
            if child.nodeType == child.TEXT_NODE:
                parts.append(child.data)
            elif child.nodeType == child.ELEMENT_NODE:
                parts.append(_collect_text(child))
        return "".join(parts)

    def _get_literal(Pnode):
        """
        Get (recursively) the full text from a DOM Node.

        @param Pnode: DOM Node
        @return: string
        """
        rc = _collect_text(Pnode)
        if state.options.space_preserve:
            return rc
        # Normalize only once, on the complete text: doing it per element would
        # drop the white space that separates the text of nested elements
        return re.sub(r'(\r| |\n|\t)+', " ", rc).strip()
    # end _get_literal

    def _set_time(value):
        if not node.hasAttribute("datatype"):
            # Check the datatype:
            dt = _format_test(value)
            if dt != plain:
                node.setAttribute("datatype", dt)
        # Finally, set the value itself
        node.setAttribute("content", value)
    # end _set_time

    if not node.hasAttribute("content"):
        # @content has top priority over the others...
        if node.hasAttribute("datetime"):
            _set_time(node.getAttribute("datetime"))
        elif node.hasAttribute("dateTime"):
            _set_time(node.getAttribute("dateTime"))
        elif node.tagName == "time":
            # Note that a possible @datetime/@dateTime value has already been taken care of
            _set_time(_get_literal(node))
@param node: the current node that could be modified @param state: current state @type state: L{Execution context<pyRdfa.state.ExecutionContext>}
def remove_rel(node, _state):
    """
    If @property and @rel/@rev are on the same element, then only CURIE and URI can appear as a rel/rev value.

    @param node: the current node that could be modified
    @param _state: current state (not used by this function)
    @type _state: L{Execution context<pyRdfa.state.ExecutionContext>}
    """
    def _massage_node(node, attr):
        """The real work for remove_rel is done here, parametrized with @rel and @rev"""
        if not (node.hasAttribute("property") and node.hasAttribute(attr)):
            return
        # Resolved only when there is something to filter
        from ..termorcurie import termname
        vals = node.getAttribute(attr).split()
        if not vals:
            # empty attribute value: left untouched
            return
        # Keep only the values that do not match the 'termname' pattern
        final_vals = [v for v in vals if not termname.match(v)]
        if final_vals:
            node.setAttribute(attr, ' '.join(final_vals))
        else:
            node.removeAttribute(attr)

    _massage_node(node, "rev")
    _massage_node(node, "rel")
If @property and @rel/@rev are on the same element, then only CURIE and URI can appear as a rel/rev value.
@param node: the current node that could be modified @param state: current state @type state: L{Execution context<pyRdfa.state.ExecutionContext>}