SecondLife: lib/python/indra/util/named

00001 """\
00002 @file named_query.py
00003 @author Ryan Williams, Phoenix
00004 @date 2007-07-31
00005 @brief An API for running named queries.
00006 
00007 $LicenseInfo:firstyear=2007&license=mit$
00008 
00009 Copyright (c) 2007-2008, Linden Research, Inc.
00010 
00011 Permission is hereby granted, free of charge, to any person obtaining a copy
00012 of this software and associated documentation files (the "Software"), to deal
00013 in the Software without restriction, including without limitation the rights
00014 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
00015 copies of the Software, and to permit persons to whom the Software is
00016 furnished to do so, subject to the following conditions:
00017 
00018 The above copyright notice and this permission notice shall be included in
00019 all copies or substantial portions of the Software.
00020 
00021 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00022 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00023 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
00024 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
00025 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
00026 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
00027 THE SOFTWARE.
00028 $/LicenseInfo$
00029 """
00030 
00031 import errno
00032 import MySQLdb
00033 import MySQLdb.cursors
00034 import os
00035 import os.path
00036 import re
00037 import time
00038 
00039 #import sys # *TODO: remove. only used in testing.
00040 #import pprint # *TODO: remove. only used in testing.
00041 
00042 try:
00043     set = set
00044 except NameError:
00045     from sets import Set as set
00046 
00047 from indra.base import llsd
00048 from indra.base import config
00049 
00050 NQ_FILE_SUFFIX = config.get('named-query-file-suffix', '')
00051 NQ_FILE_SUFFIX_LEN  = len(NQ_FILE_SUFFIX)
00052 
00053 _g_named_manager = None
00054 
00055 def _init_g_named_manager(sql_dir = None):
00056     """Initializes a global NamedManager object to point at a
00057     specified named queries hierarchy.
00058 
00059     This function is intended entirely for testing purposes,
00060     because it's tricky to control the config from inside a test."""
00061     if sql_dir is None:
00062         sql_dir = config.get('named-query-base-dir')
00063 
00064     # extra fallback directory in case config doesn't return what we want
00065     if sql_dir is None:
00066         sql_dir = os.path.dirname(__file__) + "../../../../web/dataservice/sql"
00067 
00068     global _g_named_manager
00069     _g_named_manager = NamedQueryManager(
00070         os.path.abspath(os.path.realpath(sql_dir)))
00071 
00072 def get(name):
00073     "Get the named query object to be used to perform queries"
00074     if _g_named_manager is None:
00075         _init_g_named_manager()
00076     return _g_named_manager.get(name)
00077 
00078 def sql(connection, name, params):
00079     # use module-global NamedQuery object to perform default substitution
00080     return get(name).sql(connection, params)
00081 
00082 def run(connection, name, params, expect_rows = None):
00083     """\
00084 @brief given a connection, run a named query with the params
00085 
00086 Note that this function will fetch ALL rows.
00087 @param connection The connection to use
00088 @param name The name of the query to run
00089 @param params The parameters passed into the query
00090 @param expect_rows The number of rows expected. Set to 1 if return_as_map is true.  Raises ExpectationFailed if the number of returned rows doesn't exactly match.  Kind of a hack.
00091 @return Returns the result set as a list of dicts.
00092 """
00093     return get(name).run(connection, params, expect_rows)
00094 
00095 class ExpectationFailed(Exception):
00096     """ Exception that is raised when an expectation for an sql query
00097     is not met."""
00098     def __init__(self, message):
00099         Exception.__init__(self, message)
00100         self.message = message
00101 
00102 class NamedQuery(object):
00103     def __init__(self, name, filename):
00104         """ Construct a NamedQuery object.  The name argument is an
00105         arbitrary name as a handle for the query, and the filename is
00106         a path to a file containing an llsd named query document."""
00107         self._stat_interval_seconds = 5  # 5 seconds
00108         self._name = name
00109         if (filename is not None) \
00110                 and (NQ_FILE_SUFFIX != filename[-NQ_FILE_SUFFIX_LEN:]):
00111             filename = filename + NQ_FILE_SUFFIX
00112         self._location = filename
00113         self._alternative = dict()
00114         self._last_mod_time = 0
00115         self._last_check_time = 0
00116         self.deleted = False
00117         self.load_contents()
00118 
00119     def name(self):
00120         """ The name of the query. """
00121         return self._name
00122 
00123     def get_modtime(self):
00124         """ Returns the mtime (last modified time) of the named query
00125         file, if such exists."""
00126         if self._location:
00127             return os.path.getmtime(self._location)
00128         return 0
00129 
00130     def load_contents(self):
00131         """ Loads and parses the named query file into self.  Does
00132         nothing if self.location is nonexistant."""
00133         if self._location:
00134             self._reference_contents(llsd.parse(open(self._location).read()))
00135             # Check for alternative implementations
00136             try:
00137                 for name, alt in self._contents['alternative'].items():
00138                     nq = NamedQuery(name, None)
00139                     nq._reference_contents(alt)
00140                     self._alternative[name] = nq
00141             except KeyError, e:
00142                 pass
00143             self._last_mod_time = self.get_modtime()
00144             self._last_check_time = time.time()
00145 
00146     def _reference_contents(self, contents):
00147         "Helper method which builds internal structure from parsed contents"
00148         self._contents = contents
00149         self._ttl = int(self._contents.get('ttl', 0))
00150         self._return_as_map = bool(self._contents.get('return_as_map', False))
00151         self._legacy_dbname = self._contents.get('legacy_dbname', None)
00152 
00153         # reset these before doing the sql conversion because we will
00154         # read them there. reset these while loading so we pick up
00155         # changes.
00156         self._around = set()
00157         self._append = set()
00158         self._integer = set()
00159         self._options = self._contents.get('dynamic_where', {})
00160         for key in self._options:
00161             if isinstance(self._options[key], basestring):
00162                 self._options[key] = self._convert_sql(self._options[key])
00163             elif isinstance(self._options[key], list):
00164                 lines = []
00165                 for line in self._options[key]:
00166                     lines.append(self._convert_sql(line))
00167                 self._options[key] = lines
00168             else:
00169                 moreopt = {}
00170                 for kk in self._options[key]:
00171                     moreopt[kk] = self._convert_sql(self._options[key][kk]) 
00172                 self._options[key] = moreopt
00173         self._base_query = self._convert_sql(self._contents['base_query'])
00174         self._query_suffix = self._convert_sql(
00175             self._contents.get('query_suffix', ''))
00176 
00177     def _convert_sql(self, sql):
00178         """convert the parsed sql into a useful internal structure.
00179 
00180         This function has to turn the named query format into a pyformat
00181         style. It also has to look for %:name% and :name% and
00182         ready them for use in LIKE statements"""
00183         if sql:
00184             #print >>sys.stderr, "sql:",sql
00185             expr = re.compile("(%?):([a-zA-Z][a-zA-Z0-9_-]*)%")
00186             sql = expr.sub(self._prepare_like, sql)
00187             expr = re.compile("#:([a-zA-Z][a-zA-Z0-9_-]*)")
00188             sql = expr.sub(self._prepare_integer, sql)
00189             expr = re.compile(":([a-zA-Z][a-zA-Z0-9_-]*)")
00190             sql = expr.sub("%(\\1)s", sql)
00191         return sql
00192 
00193     def _prepare_like(self, match):
00194         """This function changes LIKE statement replace behavior
00195 
00196         It works by turning %:name% to %(_name_around)s and :name% to
00197         %(_name_append)s. Since a leading '_' is not a valid keyname
00198         input (enforced via unit tests), it will never clash with
00199         existing keys. Then, when building the statement, the query
00200         runner will generate corrected strings."""
00201         if match.group(1) == '%':
00202             # there is a leading % so this is treated as prefix/suffix
00203             self._around.add(match.group(2))
00204             return "%(" + self._build_around_key(match.group(2)) + ")s"
00205         else:
00206             # there is no leading %, so this is suffix only
00207             self._append.add(match.group(2))
00208             return "%(" + self._build_append_key(match.group(2)) + ")s"
00209 
00210     def _build_around_key(self, key):
00211         return "_" + key + "_around"
00212 
00213     def _build_append_key(self, key):
00214         return "_" + key + "_append"
00215 
00216     def _prepare_integer(self, match):
00217         """This function adjusts the sql for #:name replacements
00218 
00219         It works by turning #:name to %(_name_as_integer)s. Since a
00220         leading '_' is not a valid keyname input (enforced via unit
00221         tests), it will never clash with existing keys. Then, when
00222         building the statement, the query runner will generate
00223         corrected strings."""
00224         self._integer.add(match.group(1))
00225         return "%(" + self._build_integer_key(match.group(1)) + ")s"
00226 
00227     def _build_integer_key(self, key):
00228         return "_" + key + "_as_integer"
00229 
00230     def _strip_wildcards_to_list(self, value):
00231         """Take string, and strip out the LIKE special characters.
00232 
00233         Technically, this is database dependant, but postgresql and
00234         mysql use the same wildcards, and I am not aware of a general
00235         way to handle this. I think you need a sql statement of the
00236         form:
00237 
00238         LIKE_STRING( [ANY,ONE,str]... )
00239 
00240         which would treat ANY as their any string, and ONE as their
00241         single glyph, and str as something that needs database
00242         specific encoding to not allow any % or _ to affect the query.
00243 
00244         As it stands, I believe it's impossible to write a named query
00245         style interface which uses like to search the entire space of
00246         text available. Imagine the query:
00247 
00248         % of brain used by average linden
00249 
00250         In order to search for %, it must be escaped, so once you have
00251         escaped the string to not do wildcard searches, and be escaped
00252         for the database, and then prepended the wildcard you come
00253         back with one of:
00254 
00255         1) %\% of brain used by average linden
00256         2) %%% of brain used by average linden
00257 
00258         Then, when passed to the database to be escaped to be database
00259         safe, you get back:
00260         
00261         1) %\\% of brain used by average linden
00262         : which means search for any character sequence, followed by a
00263           backslash, followed by any sequence, followed by ' of
00264           brain...'
00265         2) %%% of brain used by average linden
00266         : which (I believe) means search for a % followed by any
00267           character sequence followed by 'of brain...'
00268 
00269         Neither of which is what we want!
00270 
00271         So, we need a vendor (or extention) for LIKE_STRING. Anyone
00272         want to write it?"""
00273         utf8_value = unicode(value, "utf-8")
00274         esc_list = []
00275         remove_chars = set(u"%_")
00276         for glyph in utf8_value:
00277             if glyph in remove_chars:
00278                 continue
00279             esc_list.append(glyph.encode("utf-8"))
00280         return esc_list
00281 
00282     def delete(self):
00283         """ Makes this query unusable by deleting all the members and
00284         setting the deleted member.  This is desired when the on-disk
00285         query has been deleted but the in-memory copy remains."""
00286         # blow away all members except _name, _location, and deleted
00287         name, location = self._name, self._location
00288         for key in self.__dict__.keys():
00289             del self.__dict__[key]
00290         self.deleted = True
00291         self._name, self._location = name, location
00292 
00293     def ttl(self):
00294         """ Estimated time to live of this query. Used for web
00295         services to set the Expires header."""
00296         return self._ttl
00297 
00298     def legacy_dbname(self):
00299         return self._legacy_dbname
00300 
00301     def return_as_map(self):
00302         """ Returns true if this query is configured to return its
00303         results as a single map (as opposed to a list of maps, the
00304         normal behavior)."""
00305         
00306         return self._return_as_map
00307 
00308     def for_schema(self, db_name):
00309         "Look trough the alternates and return the correct query"
00310         try:
00311             return self._alternative[db_name]
00312         except KeyError, e:
00313             pass
00314         return self
00315 
00316     def run(self, connection, params, expect_rows = None, use_dictcursor = True):
00317         """given a connection, run a named query with the params
00318 
00319         Note that this function will fetch ALL rows. We do this because it
00320         opens and closes the cursor to generate the values, and this 
00321         isn't a generator so the cursor has no life beyond the method call.
00322 
00323         @param cursor The connection to use (this generates its own cursor for the query)
00324         @param name The name of the query to run
00325         @param params The parameters passed into the query
00326         @param expect_rows The number of rows expected. Set to 1 if return_as_map is true.  Raises ExpectationFailed if the number of returned rows doesn't exactly match.  Kind of a hack.
00327         @param use_dictcursor Set to false to use a normal cursor and manually convert the rows to dicts.
00328         @return Returns the result set as a list of dicts, or, if the named query has return_as_map set to true, returns a single dict.
00329         """
00330         if use_dictcursor:
00331             cursor = connection.cursor(MySQLdb.cursors.DictCursor)
00332         else:
00333             cursor = connection.cursor()
00334         
00335         statement = self.sql(connection, params)
00336         #print "SQL:", statement
00337         rows = cursor.execute(statement)
00338         
00339         # *NOTE: the expect_rows argument is a very cheesy way to get some
00340         # validation on the result set.  If you want to add more expectation
00341         # logic, do something more object-oriented and flexible.  Or use an ORM.
00342         if(self._return_as_map):
00343             expect_rows = 1
00344         if expect_rows is not None and rows != expect_rows:
00345             cursor.close()
00346             raise ExpectationFailed("Statement expected %s rows, got %s.  Sql: %s" % (
00347                 expect_rows, rows, statement))
00348 
00349         # convert to dicts manually if we're not using a dictcursor
00350         if use_dictcursor:
00351             result_set = cursor.fetchall()
00352         else:
00353             if cursor.description is None:
00354                 # an insert or something
00355                 x = cursor.fetchall()
00356                 cursor.close()
00357                 return x
00358 
00359             names = [x[0] for x in cursor.description]
00360 
00361             result_set = []
00362             for row in cursor.fetchall():
00363                 converted_row = {}
00364                 for idx, col_name in enumerate(names):
00365                     converted_row[col_name] = row[idx]
00366                 result_set.append(converted_row)
00367 
00368         cursor.close()
00369         if self._return_as_map:
00370             return result_set[0]
00371         return result_set
00372 
00373     def sql(self, connection, params):
00374         """ Generates an SQL statement from the named query document
00375         and a dictionary of parameters.
00376 
00377         """
00378         self.refresh()
00379 
00380         # build the query from the options available and the params
00381         base_query = []
00382         base_query.append(self._base_query)
00383         #print >>sys.stderr, "base_query:",base_query
00384         for opt, extra_where in self._options.items():
00385             if type(extra_where) in (dict, list, tuple):
00386                 if opt in params:
00387                     base_query.append(extra_where[params[opt]])
00388             else:
00389                 if opt in params and params[opt]:
00390                     base_query.append(extra_where)
00391         if self._query_suffix:
00392             base_query.append(self._query_suffix)
00393         #print >>sys.stderr, "base_query:",base_query
00394         full_query = '\n'.join(base_query)
00395 
00396         # Go through the query and rewrite all of the ones with the
00397         # @:name syntax.
00398         rewrite = _RewriteQueryForArray(params)
00399         expr = re.compile("@%\(([a-zA-Z][a-zA-Z0-9_-]*)\)s")
00400         full_query = expr.sub(rewrite.operate, full_query)
00401         params.update(rewrite.new_params)
00402 
00403         # build out the params for like. We only have to do this
00404         # parameters which were detected to have ued the where syntax
00405         # during load.
00406         #
00407         # * treat the incoming string as utf-8
00408         # * strip wildcards
00409         # * append or prepend % as appropriate
00410         new_params = {}
00411         for key in params:
00412             if key in self._around:
00413                 new_value = ['%']
00414                 new_value.extend(self._strip_wildcards_to_list(params[key]))
00415                 new_value.append('%')
00416                 new_params[self._build_around_key(key)] = ''.join(new_value)
00417             if key in self._append:
00418                 new_value = self._strip_wildcards_to_list(params[key])
00419                 new_value.append('%')
00420                 new_params[self._build_append_key(key)] = ''.join(new_value)
00421             if key in self._integer:
00422                 new_params[self._build_integer_key(key)] = int(params[key])
00423         params.update(new_params)
00424 
00425         # do substitution using the mysql (non-standard) 'literal'
00426         # function to do the escaping.
00427         sql = full_query % connection.literal(params)
00428         return sql
00429 
00430     def refresh(self):
00431         """ Refresh self from the file on the filesystem.
00432 
00433         This is optimized to be callable as frequently as you wish,
00434         without adding too much load.  It does so by only stat-ing the
00435         file every N seconds, where N defaults to 5 and is
00436         configurable through the member _stat_interval_seconds.  If the stat
00437         reveals that the file has changed, refresh will re-parse the
00438         contents of the file and use them to update the named query
00439         instance.  If the stat reveals that the file has been deleted,
00440         refresh will call self.delete to make the in-memory
00441         representation unusable."""
00442         now = time.time()
00443         if(now - self._last_check_time > self._stat_interval_seconds):
00444             self._last_check_time = now
00445             try:
00446                 modtime = self.get_modtime()
00447                 if(modtime > self._last_mod_time):
00448                     self.load_contents()
00449             except OSError, e:
00450                 if e.errno == errno.ENOENT: # file not found
00451                     self.delete() # clean up self
00452                 raise  # pass the exception along to the caller so they know that this query disappeared
00453 
00454 class NamedQueryManager(object):
00455     """ Manages the lifespan of NamedQuery objects, drawing from a
00456     directory hierarchy of named query documents.
00457 
00458     In practice this amounts to a memory cache of NamedQuery objects."""
00459     
00460     def __init__(self, named_queries_dir):
00461         """ Initializes a manager to look for named queries in a
00462         directory."""
00463         self._dir = os.path.abspath(os.path.realpath(named_queries_dir))
00464         self._cached_queries = {}
00465 
00466     def sql(self, connection, name, params):
00467         nq = self.get(name)
00468         return nq.sql(connection, params)
00469         
00470     def get(self, name):
00471         """ Returns a NamedQuery instance based on the name, either
00472         from memory cache, or by parsing from disk.
00473 
00474         The name is simply a relative path to the directory associated
00475         with the manager object.  Before returning the instance, the
00476         NamedQuery object is cached in memory, so that subsequent
00477         accesses don't have to read from disk or do any parsing.  This
00478         means that NamedQuery objects returned by this method are
00479         shared across all users of the manager object.
00480         NamedQuery.refresh is used to bring the NamedQuery objects in
00481         sync with the actual files on disk."""
00482         nq = self._cached_queries.get(name)
00483         if nq is None:
00484             nq = NamedQuery(name, os.path.join(self._dir, name))
00485             self._cached_queries[name] = nq
00486         else:
00487             try:
00488                 nq.refresh()
00489             except OSError, e:
00490                 if e.errno == errno.ENOENT: # file not found
00491                     del self._cached_queries[name]
00492                 raise # pass exception along to caller so they know that the query disappeared
00493 
00494         return nq
00495 
00496 class _RewriteQueryForArray(object):
00497     "Helper class for rewriting queries with the @:name syntax"
00498     def __init__(self, params):
00499         self.params = params
00500         self.new_params = dict()
00501 
00502     def operate(self, match):
00503         "Given a match, return the string that should be in use"
00504         key = match.group(1)
00505         value = self.params[key]
00506         if type(value) in (list,tuple):
00507             rv = []
00508             for idx in range(len(value)):
00509                 # if the value@idx is array-like, we are
00510                 # probably dealing with a VALUES
00511                 new_key = "_%s_%s"%(key, str(idx))
00512                 val_item = value[idx]
00513                 if type(val_item) in (list, tuple, dict):
00514                     if type(val_item) is dict:
00515                         # this is because in Python, the order of 
00516                         # key, value retrieval from the dict is not
00517                         # guaranteed to match what the input intended
00518                         # and for VALUES, order is important.
00519                         # TODO: Implemented ordered dict in LLSD parser?
00520                         raise ExpectationFailed('Only lists/tuples allowed,\
00521                                 received dict')
00522                     values_keys = []
00523                     for value_idx, item in enumerate(val_item):
00524                         # we want a key of the format :
00525                         # key_#replacement_#value_row_#value_col
00526                         # ugh... so if we are replacing 10 rows in user_note, 
00527                         # the first values clause would read (for @:user_notes) :-
00528                         # ( :_user_notes_0_1_1,  :_user_notes_0_1_2, :_user_notes_0_1_3 )
00529                         # the input LLSD for VALUES will look like:
00530                         # <llsd>...
00531                         # <map>
00532                         #  <key>user_notes</key>
00533                         #      <array>
00534                         #      <array> <!-- row 1 for VALUES -->
00535                         #          <string>...</string>
00536                         #          <string>...</string>
00537                         #          <string>...</string>
00538                         #      </array>
00539                         # ...
00540                         #      </array>
00541                         # </map>
00542                         # ... </llsd>
00543                         values_key = "%s_%s"%(new_key, value_idx)
00544                         self.new_params[values_key] = item
00545                         values_keys.append("%%(%s)s"%values_key)
00546                     # now collapse all these new place holders enclosed in ()
00547                     # from [':_key_0_1_1', ':_key_0_1_2', ':_key_0_1_3,...] 
00548                     # rv will have [ '(:_key_0_1_1, :_key_0_1_2, :_key_0_1_3)', ]
00549                     # which is flattened a few lines below join(rv)
00550                     rv.append('(%s)' % ','.join(values_keys))
00551                 else:
00552                     self.new_params[new_key] = val_item
00553                     rv.append("%%(%s)s"%new_key)
00554             return ','.join(rv)
00555         else:
00556             # not something that can be expanded, so just drop the
00557             # leading @ in the front of the match. This will mean that
00558             # the single value we have, be it a string, int, whatever
00559             # (other than dict) will correctly show up, eg:
00560             #
00561             # where foo in (@:foobar) -- foobar is a string, so we get
00562             # where foo in (:foobar)
00563             return match.group(0)[1:]