pyRdfa.utils

Various utilities for pyRdfa.

Most of the utilities are straightforward.

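@organization: U{World Wide Web Consortium<http://www.w3.org>} @author: U{Ivan Herman<http://www.w3.org/People/Ivan/>} @license: This software is available for use under the U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}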

  1# -*- coding: utf-8 -*-
  2"""
  3Various utilities for pyRdfa.
  4
  5Most of the utilities are straightforward.
  6
  7@organization: U{World Wide Web Consortium<http://www.w3.org>}
  8@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
  9@license: This software is available for use under the
 10U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
 11
 12
 13"""
 14
 15"""
 16$Id: utils.py,v 1.18 2016/12/08 10:13:34 ivan Exp $
 17$Date: 2016/12/08 10:13:34 $
 18"""
 19import sys, datetime
 20
 21from urllib.parse import urljoin, quote
 22from http.server import BaseHTTPRequestHandler
 23from urllib.error import HTTPError as urllib_HTTPError
 24
 25from .extras.httpheader import content_type, parse_http_datetime
 26
 27
 28from .host import preferred_suffixes
 29
 30#########################################################################################################
 31# Handling URIs
 32class URIOpener:
 33    """A wrapper around the urllib2 method to open a resource. Beyond accessing the data itself, the class
 34    sets a number of instance variable that might be relevant for processing.
 35    The class also adds an accept header to the outgoing request, namely
 36    text/html and application/xhtml+xml (unless set explicitly by the caller).
 37    
 38    If the content type is set by the server, the relevant HTTP response field is used. Otherwise,
 39    common suffixes are used (see L{host.preferred_suffixes}) to set the content type (this is really of importance
 40    for C{file:///} URI-s). If none of these works, the content type is empty.
 41        
 42    Interpretation of the content type for the return is done by Deron Meranda's U{httpheader module<http://deron.meranda.us/>}.
 43    
 44    @ivar data: the real data, ie, a file-like object
 45    @ivar headers: the return headers as sent back by the server
 46    @ivar content_type: the content type of the resource or the empty string, if the content type cannot be determined
 47    @ivar location: the real location of the data (ie, after possible redirection and content negotiation)
 48    @ivar last_modified_date: sets the last modified date if set in the header, None otherwise
 49    @ivar expiration_date: sets the expiration date if set in the header, I{current UTC plus one day} otherwise (this is used for caching purposes, hence this artificial setting)
 50    """
 51    CONTENT_LOCATION = 'Content-Location'
 52    CONTENT_TYPE =     'Content-Type'
 53    LAST_MODIFIED =    'Last-Modified'
 54    EXPIRES =          'Expires'
 55
 56    def __init__(self, name, additional_headers={}, verify=True):
 57        """
 58        @param name: URL to be opened
 59        @keyword additional_headers: additional HTTP request headers to be added to the call
 60        """        
 61        try:
 62            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
 63            url = name.split('#')[0]
 64            if 'Accept' not in additional_headers:
 65                additional_headers['Accept'] = 'text/html, application/xhtml+xml'
 66                
 67            import requests
 68            # For security reason certificate verification is now done by default. But, can be
 69            # disabled for sites still go wrong because the cerficates are not o.k. with request...
 70            r = requests.get(url, headers=additional_headers, verify=verify)
 71            self.data = r.content
 72            self.headers = r.headers
 73            
 74            if URIOpener.CONTENT_TYPE in self.headers:
 75                # The call below will remove the possible media type parameters, like charset settings
 76                ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
 77                self.content_type = ct.media_type
 78                if 'charset' in ct.parmdict:
 79                    self.charset = ct.parmdict['charset']
 80                else:
 81                    self.charset = None
 82                # print
 83            else:
 84                # check if the suffix can be used for the content type; this may be important
 85                # for file:// type URI or if the server is not properly set up to return the right
 86                # mime type
 87                self.charset = None
 88                self.content_type = ""
 89                for suffix in preferred_suffixes.keys():
 90                    if name.endswith(suffix):
 91                        self.content_type = preferred_suffixes[suffix]
 92                        break
 93            
 94            if URIOpener.CONTENT_LOCATION in self.headers:
 95                self.location = urljoin(r.url, self.headers[URIOpener.CONTENT_LOCATION])
 96            else:
 97                self.location = name
 98            
 99            self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
100            if URIOpener.EXPIRES in self.headers:
101                try:
102                    # Thanks to Deron Meranda for the HTTP date conversion method...
103                    self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
104                except:
105                    # The Expires date format was wrong, sorry, forget it...
106                    pass
107
108            self.last_modified_date = None
109            if URIOpener.LAST_MODIFIED in self.headers:
110                try:
111                    # Thanks to Deron Meranda for the HTTP date conversion method...
112                    self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
113                except:
114                    # The last modified date format was wrong, sorry, forget it...
115                    pass
116                
117        except urllib_HTTPError:
118            e = sys.exc_info()[1]
119            from . import HTTPError
120            msg = BaseHTTPRequestHandler.responses[e.code]
121            raise HTTPError('%s' % msg[1], e.code)
122        except Exception:
123            e = sys.exc_info()[1]
124            from . import RDFaError
125            raise RDFaError('%s' % e)
126
127#########################################################################################################
128
129# 'safe' characters for the URI quoting, ie, characters that can safely stay as they are. Other 
130# special characters are converted to their %.. equivalents for namespace prefixes
131_unquotedChars = ':/\?=#~'
132_warnChars = [' ','\n','\r','\t']
133
def quote_URI(uri, options=None):
    """
    'Quote' a URI, ie, exchange special characters for their '%..' equivalents. Some of the characters
    may stay as they are (listed in L{_unquotedChars}). If one of the characters listed in L{_warnChars}
    is also in the (stripped) uri, an extra warning is also generated.
    @param uri: URI
    @param options: if not None, its C{add_warning} method is called (at most once) when an unusual character is found
    @type options: L{Options<pyRdfa.Options>}
    @return: the stripped and quoted URI
    """
    from . import err_unusual_char_in_URI
    suri = uri.strip()
    # One warning per URI is enough, however many unusual characters it contains
    if options is not None and any(c in suri for c in _warnChars):
        options.add_warning(err_unusual_char_in_URI % suri)
    return quote(suri, _unquotedChars)
151    
152#########################################################################################################
153    
def create_file_name(uri):
    """
    Create a suitable file name from an (absolute) URI. Used, eg, for the generation of a file name for a cached vocabulary file.

    The URI is stripped and quoted (keeping the characters of L{_unquotedChars}), then each of the
    characters C{' ' % - + / ? : = #} is replaced by an underscore.
    @param uri: (absolute) URI
    @return: the generated file name
    """
    suri = uri.strip()
    final_uri = quote(suri, _unquotedChars)
    # Replace some potentially dangerous characters, all in a single pass
    return final_uri.translate(str.maketrans(dict.fromkeys(' %-+/?:=#', '_')))
162
163#########################################################################################################
def has_one_of_attributes(node, *args):
    """
    Check whether one of the listed attributes is present on a (DOM) node.
    @param node: DOM element node
    @param args: possible attribute names, given either as separate arguments or as a single tuple or list
    @return: True if at least one of the attributes is present, False otherwise (also when no name is given)
    @rtype: Boolean
    """
    if not args:
        return False
    # The names may come packed into one tuple/list as the first argument
    first = args[0]
    names = first if isinstance(first, (tuple, list)) else args
    return any(node.hasAttribute(attr) for attr in names)
180
181#########################################################################################################
def traverse_tree(node, func):
    """Traverse the whole element tree, and perform the function C{func} on all the elements.

    The function is called on C{node} first, then (recursively, in child order) on the element
    children; children that are not element nodes are skipped.
    @param node: DOM element node
    @param func: function to be called on the node. Input parameter is a DOM Element Node. If the function returns a boolean True, the recursion is stopped for the subtree below that node.
    """
    if func(node):
        # The callback asked not to descend below this node
        return

    for child in node.childNodes:
        if child.nodeType == node.ELEMENT_NODE:
            traverse_tree(child, func)
193
194#########################################################################################################
def return_XML(state, inode, base=True, xmlns=True):
    """
    Get (recursively) the XML Literal content of a DOM Element Node. (Most of the processing is done
    via a C{node.toxml} call of the xml minidom implementation.)

    The work is done on a deep clone, ie, C{inode} itself is not modified.

    @param inode: DOM Node
    @param state: L{pyRdfa.state.ExecutionContext}
    @param base: whether the base element should be added to the output
    @type base: Boolean
    @param xmlns: whether the namespace declarations should be repeated in the generated node
    @type xmlns: Boolean
    @return: string
    """
    node = inode.cloneNode(True)
    # Decorate the element with namespaces value and, optionally, base
    if base:
        node.setAttribute("xml:base", state.base)
    if xmlns:
        namespaces = state.term_or_curie.xmlns
        for prefix in namespaces:
            attr_name = "xmlns:%s" % prefix
            # Declarations already present on the element take precedence
            if not node.hasAttribute(attr_name):
                node.setAttribute(attr_name, "%s" % namespaces[prefix])
        # Set the default namespace, if not done (and is available)
        if not node.getAttribute("xmlns") and state.defaultNS is not None:
            node.setAttribute("xmlns", state.defaultNS)
    return node.toxml()
220
221#########################################################################################################
222
def dump(node):
    """
    This is just for debug purposes: it prints the content of the tree starting at node, as
    serialized by C{toprettyxml} with an empty indent and an empty newline string.

    @param node: DOM node
    """
    print(node.toprettyxml(indent="", newl=""))
230    
class URIOpener:
 33class URIOpener:
 34    """A wrapper around the urllib2 method to open a resource. Beyond accessing the data itself, the class
 35    sets a number of instance variable that might be relevant for processing.
 36    The class also adds an accept header to the outgoing request, namely
 37    text/html and application/xhtml+xml (unless set explicitly by the caller).
 38    
 39    If the content type is set by the server, the relevant HTTP response field is used. Otherwise,
 40    common suffixes are used (see L{host.preferred_suffixes}) to set the content type (this is really of importance
 41    for C{file:///} URI-s). If none of these works, the content type is empty.
 42        
 43    Interpretation of the content type for the return is done by Deron Meranda's U{httpheader module<http://deron.meranda.us/>}.
 44    
 45    @ivar data: the real data, ie, a file-like object
 46    @ivar headers: the return headers as sent back by the server
 47    @ivar content_type: the content type of the resource or the empty string, if the content type cannot be determined
 48    @ivar location: the real location of the data (ie, after possible redirection and content negotiation)
 49    @ivar last_modified_date: sets the last modified date if set in the header, None otherwise
 50    @ivar expiration_date: sets the expiration date if set in the header, I{current UTC plus one day} otherwise (this is used for caching purposes, hence this artificial setting)
 51    """
 52    CONTENT_LOCATION = 'Content-Location'
 53    CONTENT_TYPE =     'Content-Type'
 54    LAST_MODIFIED =    'Last-Modified'
 55    EXPIRES =          'Expires'
 56
 57    def __init__(self, name, additional_headers={}, verify=True):
 58        """
 59        @param name: URL to be opened
 60        @keyword additional_headers: additional HTTP request headers to be added to the call
 61        """        
 62        try:
 63            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
 64            url = name.split('#')[0]
 65            if 'Accept' not in additional_headers:
 66                additional_headers['Accept'] = 'text/html, application/xhtml+xml'
 67                
 68            import requests
 69            # For security reason certificate verification is now done by default. But, can be
 70            # disabled for sites still go wrong because the cerficates are not o.k. with request...
 71            r = requests.get(url, headers=additional_headers, verify=verify)
 72            self.data = r.content
 73            self.headers = r.headers
 74            
 75            if URIOpener.CONTENT_TYPE in self.headers:
 76                # The call below will remove the possible media type parameters, like charset settings
 77                ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
 78                self.content_type = ct.media_type
 79                if 'charset' in ct.parmdict:
 80                    self.charset = ct.parmdict['charset']
 81                else:
 82                    self.charset = None
 83                # print
 84            else:
 85                # check if the suffix can be used for the content type; this may be important
 86                # for file:// type URI or if the server is not properly set up to return the right
 87                # mime type
 88                self.charset = None
 89                self.content_type = ""
 90                for suffix in preferred_suffixes.keys():
 91                    if name.endswith(suffix):
 92                        self.content_type = preferred_suffixes[suffix]
 93                        break
 94            
 95            if URIOpener.CONTENT_LOCATION in self.headers:
 96                self.location = urljoin(r.url, self.headers[URIOpener.CONTENT_LOCATION])
 97            else:
 98                self.location = name
 99            
100            self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
101            if URIOpener.EXPIRES in self.headers:
102                try:
103                    # Thanks to Deron Meranda for the HTTP date conversion method...
104                    self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
105                except:
106                    # The Expires date format was wrong, sorry, forget it...
107                    pass
108
109            self.last_modified_date = None
110            if URIOpener.LAST_MODIFIED in self.headers:
111                try:
112                    # Thanks to Deron Meranda for the HTTP date conversion method...
113                    self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
114                except:
115                    # The last modified date format was wrong, sorry, forget it...
116                    pass
117                
118        except urllib_HTTPError:
119            e = sys.exc_info()[1]
120            from . import HTTPError
121            msg = BaseHTTPRequestHandler.responses[e.code]
122            raise HTTPError('%s' % msg[1], e.code)
123        except Exception:
124            e = sys.exc_info()[1]
125            from . import RDFaError
126            raise RDFaError('%s' % e)

A wrapper around the urllib2 method to open a resource. Beyond accessing the data itself, the class sets a number of instance variable that might be relevant for processing. The class also adds an accept header to the outgoing request, namely text/html and application/xhtml+xml (unless set explicitly by the caller).

If the content type is set by the server, the relevant HTTP response field is used. Otherwise, common suffixes are used (see L{host.preferred_suffixes}) to set the content type (this is really of importance for C{file:///} URI-s). If none of these works, the content type is empty.

Interpretation of the content type for the return is done by Deron Meranda's U{httpheader module<http://deron.meranda.us/>}.

@ivar data: the real data, ie, a file-like object @ivar headers: the return headers as sent back by the server @ivar content_type: the content type of the resource or the empty string, if the content type cannot be determined @ivar location: the real location of the data (ie, after possible redirection and content negotiation) @ivar last_modified_date: sets the last modified date if set in the header, None otherwise @ivar expiration_date: sets the expiration date if set in the header, I{current UTC plus one day} otherwise (this is used for caching purposes, hence this artificial setting)

URIOpener(name, additional_headers={}, verify=True)
 57    def __init__(self, name, additional_headers={}, verify=True):
 58        """
 59        @param name: URL to be opened
 60        @keyword additional_headers: additional HTTP request headers to be added to the call
 61        """        
 62        try:
 63            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
 64            url = name.split('#')[0]
 65            if 'Accept' not in additional_headers:
 66                additional_headers['Accept'] = 'text/html, application/xhtml+xml'
 67                
 68            import requests
 69            # For security reason certificate verification is now done by default. But, can be
 70            # disabled for sites still go wrong because the cerficates are not o.k. with request...
 71            r = requests.get(url, headers=additional_headers, verify=verify)
 72            self.data = r.content
 73            self.headers = r.headers
 74            
 75            if URIOpener.CONTENT_TYPE in self.headers:
 76                # The call below will remove the possible media type parameters, like charset settings
 77                ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
 78                self.content_type = ct.media_type
 79                if 'charset' in ct.parmdict:
 80                    self.charset = ct.parmdict['charset']
 81                else:
 82                    self.charset = None
 83                # print
 84            else:
 85                # check if the suffix can be used for the content type; this may be important
 86                # for file:// type URI or if the server is not properly set up to return the right
 87                # mime type
 88                self.charset = None
 89                self.content_type = ""
 90                for suffix in preferred_suffixes.keys():
 91                    if name.endswith(suffix):
 92                        self.content_type = preferred_suffixes[suffix]
 93                        break
 94            
 95            if URIOpener.CONTENT_LOCATION in self.headers:
 96                self.location = urljoin(r.url, self.headers[URIOpener.CONTENT_LOCATION])
 97            else:
 98                self.location = name
 99            
100            self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
101            if URIOpener.EXPIRES in self.headers:
102                try:
103                    # Thanks to Deron Meranda for the HTTP date conversion method...
104                    self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
105                except:
106                    # The Expires date format was wrong, sorry, forget it...
107                    pass
108
109            self.last_modified_date = None
110            if URIOpener.LAST_MODIFIED in self.headers:
111                try:
112                    # Thanks to Deron Meranda for the HTTP date conversion method...
113                    self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
114                except:
115                    # The last modified date format was wrong, sorry, forget it...
116                    pass
117                
118        except urllib_HTTPError:
119            e = sys.exc_info()[1]
120            from . import HTTPError
121            msg = BaseHTTPRequestHandler.responses[e.code]
122            raise HTTPError('%s' % msg[1], e.code)
123        except Exception:
124            e = sys.exc_info()[1]
125            from . import RDFaError
126            raise RDFaError('%s' % e)

@param name: URL to be opened @keyword additional_headers: additional HTTP request headers to be added to the call

CONTENT_LOCATION = 'Content-Location'
CONTENT_TYPE = 'Content-Type'
LAST_MODIFIED = 'Last-Modified'
EXPIRES = 'Expires'
def quote_URI(uri, options=None):
def quote_URI(uri, options=None):
    """
    'Quote' a URI, ie, exchange special characters for their '%..' equivalents. Some of the characters
    may stay as they are (listed in L{_unquotedChars}). If one of the characters listed in L{_warnChars}
    is also in the (stripped) uri, an extra warning is also generated.
    @param uri: URI
    @param options: if not None, its C{add_warning} method is called (at most once) when an unusual character is found
    @type options: L{Options<pyRdfa.Options>}
    @return: the stripped and quoted URI
    """
    from . import err_unusual_char_in_URI
    suri = uri.strip()
    # One warning per URI is enough, however many unusual characters it contains
    if options is not None and any(c in suri for c in _warnChars):
        options.add_warning(err_unusual_char_in_URI % suri)
    return quote(suri, _unquotedChars)

'quote' a URI, ie, exchange special characters for their '%..' equivalents. Some of the characters may stay as they are (listed in L{_unquotedChars}). If one of the characters listed in L{_warnChars} is also in the uri, an extra warning is also generated. @param uri: URI @param options: @type options: L{Options<pyRdfa.Options>}

def create_file_name(uri):
def create_file_name(uri):
    """
    Create a suitable file name from an (absolute) URI. Used, eg, for the generation of a file name for a cached vocabulary file.

    The URI is stripped and quoted (keeping the characters of L{_unquotedChars}), then each of the
    characters C{' ' % - + / ? : = #} is replaced by an underscore.
    @param uri: (absolute) URI
    @return: the generated file name
    """
    suri = uri.strip()
    final_uri = quote(suri, _unquotedChars)
    # Replace some potentially dangerous characters, all in a single pass
    return final_uri.translate(str.maketrans(dict.fromkeys(' %-+/?:=#', '_')))

Create a suitable file name from an (absolute) URI. Used, eg, for the generation of a file name for a cached vocabulary file.

def has_one_of_attributes(node, *args):
def has_one_of_attributes(node, *args):
    """
    Check whether one of the listed attributes is present on a (DOM) node.
    @param node: DOM element node
    @param args: possible attribute names, given either as separate arguments or as a single tuple or list
    @return: True if at least one of the attributes is present, False otherwise (also when no name is given)
    @rtype: Boolean
    """
    if not args:
        return False
    # The names may come packed into one tuple/list as the first argument
    first = args[0]
    names = first if isinstance(first, (tuple, list)) else args
    return any(node.hasAttribute(attr) for attr in names)

Check whether one of the listed attributes is present on a (DOM) node. @param node: DOM element node @param args: possible attribute names @return: True or False @rtype: Boolean

def traverse_tree(node, func):
def traverse_tree(node, func):
    """Traverse the whole element tree, and perform the function C{func} on all the elements.

    The function is called on C{node} first, then (recursively, in child order) on the element
    children; children that are not element nodes are skipped.
    @param node: DOM element node
    @param func: function to be called on the node. Input parameter is a DOM Element Node. If the function returns a boolean True, the recursion is stopped for the subtree below that node.
    """
    if func(node):
        # The callback asked not to descend below this node
        return

    for child in node.childNodes:
        if child.nodeType == node.ELEMENT_NODE:
            traverse_tree(child, func)

Traverse the whole element tree, and perform the function C{func} on all the elements. @param node: DOM element node @param func: function to be called on the node. Input parameter is a DOM Element Node. If the function returns a boolean True, the recursion is stopped.

def return_XML(state, inode, base=True, xmlns=True):
def return_XML(state, inode, base=True, xmlns=True):
    """
    Get (recursively) the XML Literal content of a DOM Element Node. (Most of the processing is done
    via a C{node.toxml} call of the xml minidom implementation.)

    The work is done on a deep clone, ie, C{inode} itself is not modified.

    @param inode: DOM Node
    @param state: L{pyRdfa.state.ExecutionContext}
    @param base: whether the base element should be added to the output
    @type base: Boolean
    @param xmlns: whether the namespace declarations should be repeated in the generated node
    @type xmlns: Boolean
    @return: string
    """
    node = inode.cloneNode(True)
    # Decorate the element with namespaces value and, optionally, base
    if base:
        node.setAttribute("xml:base", state.base)
    if xmlns:
        namespaces = state.term_or_curie.xmlns
        for prefix in namespaces:
            attr_name = "xmlns:%s" % prefix
            # Declarations already present on the element take precedence
            if not node.hasAttribute(attr_name):
                node.setAttribute(attr_name, "%s" % namespaces[prefix])
        # Set the default namespace, if not done (and is available)
        if not node.getAttribute("xmlns") and state.defaultNS is not None:
            node.setAttribute("xmlns", state.defaultNS)
    return node.toxml()

Get (recursively) the XML Literal content of a DOM Element Node. (Most of the processing is done via a C{node.toxml} call of the xml minidom implementation.)

@param inode: DOM Node @param state: L{pyRdfa.state.ExecutionContext} @param base: whether the base element should be added to the output @type base: Boolean @param xmlns: whether the namespace declarations should be repeated in the generated node @type xmlns: Boolean @return: string

def dump(node):
def dump(node):
    """
    This is just for debug purposes: it prints the content of the tree starting at node, as
    serialized by C{toprettyxml} with an empty indent and an empty newline string.

    @param node: DOM node
    """
    print(node.toprettyxml(indent="", newl=""))

This is just for debug purposes: it prints the essential content of the node in the tree starting at node.

@param node: DOM node