pyRdfa.utils
Various utilities for pyRdfa.
Most of the utilities are straightforward.
@organization: U{World Wide Web Consortium<http://www.w3.org>}
@author: U{Ivan Herman<http://www.w3.org/People/Ivan/>}
@license: This software is available for use under the
U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}
# -*- coding: utf-8 -*-
"""
Various utilities for pyRdfa.

Most of the utilities are straightforward.

@organization: U{World Wide Web Consortium<http://www.w3.org>}
@author: U{Ivan Herman<http://www.w3.org/People/Ivan/>}
@license: This software is available for use under the
U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}


"""

"""
$Id: utils.py,v 1.18 2016/12/08 10:13:34 ivan Exp $
$Date: 2016/12/08 10:13:34 $
"""
import sys
import datetime

from urllib.parse import urljoin, quote
from http.server import BaseHTTPRequestHandler
from urllib.error import HTTPError as urllib_HTTPError

from .extras.httpheader import content_type, parse_http_datetime


from .host import preferred_suffixes

#########################################################################################################
# Handling URIs
class URIOpener:
    """A wrapper around C{requests.get} to open a resource. Beyond accessing the data itself, the class
    sets a number of instance variables that might be relevant for processing.
    The class also adds an accept header to the outgoing request, namely
    text/html and application/xhtml+xml (unless set explicitly by the caller).

    If the content type is set by the server, the relevant HTTP response field is used. Otherwise,
    common suffixes are used (see L{host.preferred_suffixes}) to set the content type (this is really of importance
    for C{file:///} URI-s). If none of these works, the content type is empty.

    Interpretation of the content type for the return is done by Deron Meranda's U{httpheader module<http://deron.meranda.us/>}.

    @ivar data: the real data, ie, the body of the response (the C{content} attribute of the C{requests} response)
    @ivar headers: the return headers as sent back by the server
    @ivar content_type: the content type of the resource or the empty string, if the content type cannot be determined
    @ivar charset: the C{charset} parameter of the content type header, None if there is none
    @ivar location: the real location of the data (ie, after possible redirection and content negotiation)
    @ivar last_modified_date: sets the last modified date if set in the header, None otherwise
    @ivar expiration_date: sets the expiration date if set in the header, I{current UTC plus one day} otherwise (this is used for caching purposes, hence this artificial setting)
    """
    CONTENT_LOCATION = 'Content-Location'
    CONTENT_TYPE = 'Content-Type'
    LAST_MODIFIED = 'Last-Modified'
    EXPIRES = 'Expires'

    def __init__(self, name, additional_headers=None, verify=True):
        """
        Fetch C{name} and record its content and the relevant response headers on the instance.

        @param name: URL to be opened
        @keyword additional_headers: additional HTTP request headers to be added to the call; the
        mapping is copied, so the caller's object is never modified
        @keyword verify: passed through to C{requests.get} as its C{verify} argument
        @raise HTTPError: if a C{urllib.error.HTTPError} is raised while fetching
        @raise RDFaError: if any other exception is raised while fetching or interpreting the response
        """
        try:
            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
            url = name.split('#')[0]

            # Work on a private copy: a shared ``{}`` default that gets the Accept header added
            # in place would leak state between calls and into the caller's own dictionary.
            headers = dict(additional_headers) if additional_headers else {}
            if 'Accept' not in headers:
                headers['Accept'] = 'text/html, application/xhtml+xml'

            import requests
            # For security reasons certificate verification is done by default. It can be
            # disabled for sites whose certificates are not accepted by requests.
            r = requests.get(url, headers=headers, verify=verify)
            self.data = r.content
            self.headers = r.headers

            if URIOpener.CONTENT_TYPE in self.headers:
                # The call below will remove the possible media type parameters, like charset settings
                ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
                self.content_type = ct.media_type
                self.charset = ct.parmdict['charset'] if 'charset' in ct.parmdict else None
            else:
                # check if the suffix can be used for the content type; this may be important
                # for file:// type URI or if the server is not properly set up to return the right
                # mime type
                self.charset = None
                self.content_type = ""
                for suffix, media_type in preferred_suffixes.items():
                    if name.endswith(suffix):
                        self.content_type = media_type
                        break

            if URIOpener.CONTENT_LOCATION in self.headers:
                self.location = urljoin(r.url, self.headers[URIOpener.CONTENT_LOCATION])
            else:
                self.location = name

            # Artificial default (now + one day), used for caching when no valid Expires header exists
            self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
            if URIOpener.EXPIRES in self.headers:
                try:
                    # Thanks to Deron Meranda for the HTTP date conversion method...
                    self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
                except Exception:
                    # The Expires date format was wrong, sorry, forget it...
                    pass

            self.last_modified_date = None
            if URIOpener.LAST_MODIFIED in self.headers:
                try:
                    # Thanks to Deron Meranda for the HTTP date conversion method...
                    self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
                except Exception:
                    # The last modified date format was wrong, sorry, forget it...
                    pass

        except urllib_HTTPError as e:
            # NOTE(review): the fetch above goes through requests.get and never calls
            # raise_for_status(), so HTTP error statuses do not seem to reach this handler;
            # confirm whether that is intended.
            from . import HTTPError
            msg = BaseHTTPRequestHandler.responses[e.code]
            raise HTTPError('%s' % msg[1], e.code) from e
        except Exception as e:
            from . import RDFaError
            raise RDFaError('%s' % e) from e

#########################################################################################################

# 'safe' characters for the URI quoting, ie, characters that can safely stay as they are. Other
# special characters are converted to their %.. equivalents for namespace prefixes
_unquotedChars = r':/\?=#~'
_warnChars = [' ','\n','\r','\t']

def quote_URI(uri, options=None):
    """
    'quote' a URI, ie, exchange special characters for their '%..' equivalents. Some of the characters
    may stay as they are (listed in L{_unquotedChars}). If one of the characters listed in L{_warnChars}
    is also in the uri, an extra warning is also generated.
    @param uri: URI
    @param options: receives the warning via C{add_warning}; no warning is issued if None
    @type options: L{Options<pyRdfa.Options>}
    @return: the quoted URI (leading and trailing white space removed first)
    """
    from . import err_unusual_char_in_URI
    stripped = uri.strip()
    has_unusual_char = any(ch in stripped for ch in _warnChars)
    if has_unusual_char and options is not None:
        options.add_warning(err_unusual_char_in_URI % stripped)
    return quote(stripped, _unquotedChars)

#########################################################################################################

def create_file_name(uri):
    """
    Create a suitable file name from an (absolute) URI. Used, eg, for the generation of a file name for a cached vocabulary file.
    @param uri: URI
    @return: the quoted URI with a number of special characters replaced by '_'
    """
    quoted = quote(uri.strip(), _unquotedChars)
    # Replace some potentially dangerous characters in a single pass
    to_underscore = str.maketrans({ch: '_' for ch in ' %-+/?:=#'})
    return quoted.translate(to_underscore)

#########################################################################################################
def has_one_of_attributes(node, *args):
    """
    Check whether one of the listed attributes is present on a (DOM) node.
    @param node: DOM element node
    @param args: possible attribute names, either as separate arguments or as one list or tuple
    @return: True if at least one of the attributes is present; False otherwise (also when no names are given)
    @rtype: Boolean
    """
    if not args:
        return False
    # A single list/tuple argument stands for the attribute names themselves
    names = args[0] if isinstance(args[0], (tuple, list)) else args
    return any(node.hasAttribute(attr) for attr in names)

#########################################################################################################
def traverse_tree(node, func):
    """Traverse the whole element tree, and perform the function C{func} on all the elements.
    @param node: DOM element node
    @param func: function to be called on the node. Input parameter is a DOM Element Node. If the function returns a boolean True, the recursion is stopped.
    """
    if func(node):
        return

    element_children = (child for child in node.childNodes if child.nodeType == node.ELEMENT_NODE)
    for child in element_children:
        traverse_tree(child, func)

#########################################################################################################
def return_XML(state, inode, base=True, xmlns=True):
    """
    Get (recursively) the XML Literal content of a DOM Element Node. (Most of the processing is done
    via a C{node.toxml} call of the xml minidom implementation.)

    @param inode: DOM Node
    @param state: L{pyRdfa.state.ExecutionContext}
    @param base: whether the base element should be added to the output
    @type base: Boolean
    @param xmlns: whether the namespace declarations should be repeated in the generated node
    @type xmlns: Boolean
    @return: string
    """
    # Work on a deep copy so that the decorations below do not touch the original tree
    clone = inode.cloneNode(True)
    if base:
        clone.setAttribute("xml:base", state.base)
    if xmlns:
        namespaces = state.term_or_curie.xmlns
        for prefix in namespaces:
            declaration = "xmlns:%s" % prefix
            if not clone.hasAttribute(declaration):
                clone.setAttribute(declaration, "%s" % namespaces[prefix])
        # Set the default namespace, if not done (and is available)
        if not clone.getAttribute("xmlns") and state.defaultNS is not None:
            clone.setAttribute("xmlns", state.defaultNS)
    return clone.toxml()

#########################################################################################################

def dump(node):
    """
    This is just for debug purposes: it prints the essential content of the node in the tree starting at node.

    @param node: DOM node
    """
    rendered = node.toprettyxml(indent="", newl="")
    print(rendered)
class URIOpener:
    """A wrapper around C{requests.get} to open a resource. Beyond accessing the data itself, the class
    sets a number of instance variables that might be relevant for processing.
    The class also adds an accept header to the outgoing request, namely
    text/html and application/xhtml+xml (unless set explicitly by the caller).

    If the content type is set by the server, the relevant HTTP response field is used. Otherwise,
    common suffixes are used (see L{host.preferred_suffixes}) to set the content type (this is really of importance
    for C{file:///} URI-s). If none of these works, the content type is empty.

    Interpretation of the content type for the return is done by Deron Meranda's U{httpheader module<http://deron.meranda.us/>}.

    @ivar data: the real data, ie, the body of the response (the C{content} attribute of the C{requests} response)
    @ivar headers: the return headers as sent back by the server
    @ivar content_type: the content type of the resource or the empty string, if the content type cannot be determined
    @ivar charset: the C{charset} parameter of the content type header, None if there is none
    @ivar location: the real location of the data (ie, after possible redirection and content negotiation)
    @ivar last_modified_date: sets the last modified date if set in the header, None otherwise
    @ivar expiration_date: sets the expiration date if set in the header, I{current UTC plus one day} otherwise (this is used for caching purposes, hence this artificial setting)
    """
    CONTENT_LOCATION = 'Content-Location'
    CONTENT_TYPE = 'Content-Type'
    LAST_MODIFIED = 'Last-Modified'
    EXPIRES = 'Expires'

    def __init__(self, name, additional_headers=None, verify=True):
        """
        Fetch C{name} and record its content and the relevant response headers on the instance.

        @param name: URL to be opened
        @keyword additional_headers: additional HTTP request headers to be added to the call; the
        mapping is copied, so the caller's object is never modified
        @keyword verify: passed through to C{requests.get} as its C{verify} argument
        @raise HTTPError: if a C{urllib.error.HTTPError} is raised while fetching
        @raise RDFaError: if any other exception is raised while fetching or interpreting the response
        """
        try:
            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
            url = name.split('#')[0]

            # Work on a private copy: a shared ``{}`` default that gets the Accept header added
            # in place would leak state between calls and into the caller's own dictionary.
            headers = dict(additional_headers) if additional_headers else {}
            if 'Accept' not in headers:
                headers['Accept'] = 'text/html, application/xhtml+xml'

            import requests
            # For security reasons certificate verification is done by default. It can be
            # disabled for sites whose certificates are not accepted by requests.
            r = requests.get(url, headers=headers, verify=verify)
            self.data = r.content
            self.headers = r.headers

            if URIOpener.CONTENT_TYPE in self.headers:
                # The call below will remove the possible media type parameters, like charset settings
                ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
                self.content_type = ct.media_type
                self.charset = ct.parmdict['charset'] if 'charset' in ct.parmdict else None
            else:
                # check if the suffix can be used for the content type; this may be important
                # for file:// type URI or if the server is not properly set up to return the right
                # mime type
                self.charset = None
                self.content_type = ""
                for suffix, media_type in preferred_suffixes.items():
                    if name.endswith(suffix):
                        self.content_type = media_type
                        break

            if URIOpener.CONTENT_LOCATION in self.headers:
                self.location = urljoin(r.url, self.headers[URIOpener.CONTENT_LOCATION])
            else:
                self.location = name

            # Artificial default (now + one day), used for caching when no valid Expires header exists
            self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
            if URIOpener.EXPIRES in self.headers:
                try:
                    # Thanks to Deron Meranda for the HTTP date conversion method...
                    self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
                except Exception:
                    # The Expires date format was wrong, sorry, forget it...
                    pass

            self.last_modified_date = None
            if URIOpener.LAST_MODIFIED in self.headers:
                try:
                    # Thanks to Deron Meranda for the HTTP date conversion method...
                    self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
                except Exception:
                    # The last modified date format was wrong, sorry, forget it...
                    pass

        except urllib_HTTPError as e:
            # NOTE(review): the fetch above goes through requests.get and never calls
            # raise_for_status(), so HTTP error statuses do not seem to reach this handler;
            # confirm whether that is intended.
            from . import HTTPError
            msg = BaseHTTPRequestHandler.responses[e.code]
            raise HTTPError('%s' % msg[1], e.code) from e
        except Exception as e:
            from . import RDFaError
            raise RDFaError('%s' % e) from e
A wrapper around C{requests.get} to open a resource. Beyond accessing the data itself, the class sets a number of instance variables that might be relevant for processing. The class also adds an accept header to the outgoing request, namely text/html and application/xhtml+xml (unless set explicitly by the caller).
If the content type is set by the server, the relevant HTTP response field is used. Otherwise, common suffixes are used (see L{host.preferred_suffixes}) to set the content type (this is really of importance for C{file:///} URI-s). If none of these works, the content type is empty.
Interpretation of the content type for the return is done by Deron Meranda's U{httpheader module<http://deron.meranda.us/>}.
@ivar data: the real data, ie, the body of the response (the C{content} attribute of the C{requests} response) @ivar headers: the return headers as sent back by the server @ivar content_type: the content type of the resource or the empty string, if the content type cannot be determined @ivar location: the real location of the data (ie, after possible redirection and content negotiation) @ivar last_modified_date: sets the last modified date if set in the header, None otherwise @ivar expiration_date: sets the expiration date if set in the header, I{current UTC plus one day} otherwise (this is used for caching purposes, hence this artificial setting)
def __init__(self, name, additional_headers=None, verify=True):
    """
    Fetch C{name} and record its content and the relevant response headers on the instance.

    @param name: URL to be opened
    @keyword additional_headers: additional HTTP request headers to be added to the call; the
    mapping is copied, so the caller's object is never modified
    @keyword verify: passed through to C{requests.get} as its C{verify} argument
    @raise HTTPError: if a C{urllib.error.HTTPError} is raised while fetching
    @raise RDFaError: if any other exception is raised while fetching or interpreting the response
    """
    try:
        # Note the removal of the fragment ID. This is necessary, per the HTTP spec
        url = name.split('#')[0]

        # Work on a private copy: a shared ``{}`` default that gets the Accept header added
        # in place would leak state between calls and into the caller's own dictionary.
        headers = dict(additional_headers) if additional_headers else {}
        if 'Accept' not in headers:
            headers['Accept'] = 'text/html, application/xhtml+xml'

        import requests
        # For security reasons certificate verification is done by default. It can be
        # disabled for sites whose certificates are not accepted by requests.
        r = requests.get(url, headers=headers, verify=verify)
        self.data = r.content
        self.headers = r.headers

        if URIOpener.CONTENT_TYPE in self.headers:
            # The call below will remove the possible media type parameters, like charset settings
            ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
            self.content_type = ct.media_type
            self.charset = ct.parmdict['charset'] if 'charset' in ct.parmdict else None
        else:
            # check if the suffix can be used for the content type; this may be important
            # for file:// type URI or if the server is not properly set up to return the right
            # mime type
            self.charset = None
            self.content_type = ""
            for suffix, media_type in preferred_suffixes.items():
                if name.endswith(suffix):
                    self.content_type = media_type
                    break

        if URIOpener.CONTENT_LOCATION in self.headers:
            self.location = urljoin(r.url, self.headers[URIOpener.CONTENT_LOCATION])
        else:
            self.location = name

        # Artificial default (now + one day), used for caching when no valid Expires header exists
        self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
        if URIOpener.EXPIRES in self.headers:
            try:
                # Thanks to Deron Meranda for the HTTP date conversion method...
                self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
            except Exception:
                # The Expires date format was wrong, sorry, forget it...
                pass

        self.last_modified_date = None
        if URIOpener.LAST_MODIFIED in self.headers:
            try:
                # Thanks to Deron Meranda for the HTTP date conversion method...
                self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
            except Exception:
                # The last modified date format was wrong, sorry, forget it...
                pass

    except urllib_HTTPError as e:
        # NOTE(review): the fetch above goes through requests.get and never calls
        # raise_for_status(), so HTTP error statuses do not seem to reach this handler;
        # confirm whether that is intended.
        from . import HTTPError
        msg = BaseHTTPRequestHandler.responses[e.code]
        raise HTTPError('%s' % msg[1], e.code) from e
    except Exception as e:
        from . import RDFaError
        raise RDFaError('%s' % e) from e
@param name: URL to be opened @keyword additional_headers: additional HTTP request headers to be added to the call
def quote_URI(uri, options=None):
    """
    'quote' a URI, ie, exchange special characters for their '%..' equivalents. Some of the characters
    may stay as they are (listed in L{_unquotedChars}). If one of the characters listed in L{_warnChars}
    is also in the uri, an extra warning is also generated.
    @param uri: URI
    @param options: receives the warning via C{add_warning}; no warning is issued if None
    @type options: L{Options<pyRdfa.Options>}
    @return: the quoted URI (leading and trailing white space removed first)
    """
    from . import err_unusual_char_in_URI
    stripped = uri.strip()
    # One warning at most, however many unusual characters the URI contains
    has_unusual_char = any(ch in stripped for ch in _warnChars)
    if has_unusual_char and options is not None:
        options.add_warning(err_unusual_char_in_URI % stripped)
    return quote(stripped, _unquotedChars)
'quote' a URI, ie, exchange special characters for their '%..' equivalents. Some of the characters
may stay as they are (listed in L{_unquotedChars}). If one of the characters listed in L{_warnChars}
is also in the uri, an extra warning is also generated.
@param uri: URI
@param options:
@type options: L{Options<pyRdfa.Options>}
def create_file_name(uri):
    """
    Create a suitable file name from an (absolute) URI. Used, eg, for the generation of a file name for a cached vocabulary file.
    @param uri: URI
    @return: the quoted URI with a number of special characters replaced by '_'
    """
    quoted = quote(uri.strip(), _unquotedChars)
    # Replace some potentially dangerous characters in a single pass
    to_underscore = str.maketrans({ch: '_' for ch in ' %-+/?:=#'})
    return quoted.translate(to_underscore)
Create a suitable file name from an (absolute) URI. Used, eg, for the generation of a file name for a cached vocabulary file.
def has_one_of_attributes(node, *args):
    """
    Check whether one of the listed attributes is present on a (DOM) node.
    @param node: DOM element node
    @param args: possible attribute names, either as separate arguments or as one list or tuple
    @return: True if at least one of the attributes is present; False otherwise (also when no names are given)
    @rtype: Boolean
    """
    if not args:
        # Nothing to look for: answer with a real Boolean, as documented, rather than None
        return False
    # A single list/tuple argument stands for the attribute names themselves
    names = args[0] if isinstance(args[0], (tuple, list)) else args
    return any(node.hasAttribute(attr) for attr in names)
Check whether one of the listed attributes is present on a (DOM) node. @param node: DOM element node @param args: possible attribute names @return: True or False @rtype: Boolean
def traverse_tree(node, func):
    """Walk the element tree below C{node} (depth first, in document order) and call C{func} on each element.
    @param node: DOM element node
    @param func: function to be called on the node. Input parameter is a DOM Element Node. If the function returns a boolean True, the recursion is stopped (the children of that node are not visited).
    """
    if func(node):
        return

    # Only element children are followed; text, comment, etc. nodes are skipped
    element_children = (child for child in node.childNodes if child.nodeType == node.ELEMENT_NODE)
    for child in element_children:
        traverse_tree(child, func)
Traverse the whole element tree, and perform the function C{func} on all the elements. @param node: DOM element node @param func: function to be called on the node. Input parameter is a DOM Element Node. If the function returns a boolean True, the recursion is stopped.
def return_XML(state, inode, base=True, xmlns=True):
    """
    Get (recursively) the XML Literal content of a DOM Element Node. (Most of the processing is done
    via a C{node.toxml} call of the xml minidom implementation.)

    @param inode: DOM Node
    @param state: L{pyRdfa.state.ExecutionContext}
    @param base: whether the base element should be added to the output
    @type base: Boolean
    @param xmlns: whether the namespace declarations should be repeated in the generated node
    @type xmlns: Boolean
    @return: string
    """
    # Work on a deep copy so that the decorations below do not touch the original tree
    clone = inode.cloneNode(True)
    if base:
        clone.setAttribute("xml:base", state.base)
    if xmlns:
        namespaces = state.term_or_curie.xmlns
        for prefix in namespaces:
            declaration = "xmlns:%s" % prefix
            if not clone.hasAttribute(declaration):
                clone.setAttribute(declaration, "%s" % namespaces[prefix])
        # Set the default namespace, if not done (and is available)
        if not clone.getAttribute("xmlns") and state.defaultNS is not None:
            clone.setAttribute("xmlns", state.defaultNS)
    return clone.toxml()
Get (recursively) the XML Literal content of a DOM Element Node. (Most of the processing is done via a C{node.toxml} call of the xml minidom implementation.)
@param inode: DOM Node @param state: L{pyRdfa.state.ExecutionContext} @param base: whether the base element should be added to the output @type base: Boolean @param xmlns: whether the namespace declarations should be repeated in the generated node @type xmlns: Boolean @return: string
def dump(node):
    """
    Debugging aid: print the serialization of the tree starting at C{node}, without added indentation or line breaks.

    @param node: DOM node
    """
    rendered = node.toprettyxml(indent="", newl="")
    print(rendered)
This is just for debug purposes: it prints the essential content of the node in the tree starting at node.
@param node: DOM node