| 1 | # -*- coding: utf-8 -*- |
|---|
| 2 | """ |
|---|
| 3 | Copyright: 2008, KMM Knowledge Management in Museums (www.museum24.se) |
|---|
| 4 | KMM is owned by Lulea University of Technology (www.ltu.se) |
|---|
| 5 | Manager of KMM is Ann.Hagerfors@ltu.se |
|---|
| 6 | |
|---|
| 7 | License: GPL |
|---|
| 8 | Author: jonas.beckman@kamidev.net |
|---|
| 9 | |
|---|
| 10 | Simple search against RDF data on remote SPARQL endpoints. |
|---|
| 11 | |
|---|
| 12 | SPARQL is a query language for RDF data stored as triples (subject, |
|---|
| 13 | predicate, object). See SPARQL FAQ for more background: |
|---|
| 14 | http://thefigtrees.net/lee/sw/sparql-faq |
|---|
| 15 | |
|---|
| 16 | Currently this script is set up to search SKOS data. Read more about |
|---|
| 17 | SKOS here: http://www.w3.org/TR/skos-primer/. |
|---|
| 18 | |
|---|
| 19 | Code for HTTP queries is built on top of sparql-wrapper 1.1: |
|---|
| 20 | http://sparql-wrapper.sourceforge.net/ |
|---|
| 21 | |
|---|
| 22 | XML/RDF processing uses rdflib 2.4.1.dev-r1438. Download here: |
|---|
| 23 | http://rdflib.net/ (or use: easy_install -U "rdflib>=2.4,<=3.0a") |
|---|
| 24 | |
|---|
| 25 | Query results are returned as JSON by default and parsed with |
|---|
| 26 | Simplejson: http://pypi.python.org/pypi/simplejson |
|---|
| 27 | (or "easy_install simplejson") |
|---|
| 28 | |
|---|
| 29 | Date: 2008-04-20 1.0 looks up KMM online data on current test server |
|---|
| 30 | 2008-04-26 1.1 better search arguments. Data still on test server |
|---|
| 31 | 2008-05-05 2.0 renamed to skosfind.py. More options, Swedish characters |
|---|
| 32 | 2008-05-11 2.1 simplified API using optparse library |
|---|
| 33 | |
|---|
| 34 | Sample usage: |
|---|
| 35 | |
|---|
| 36 | >> python skosfind.py -h |
|---|
| 37 | Usage: skosfind.py [options] arg |
|---|
| 38 | |
|---|
| 39 | Options: |
|---|
| 40 | -h, --help show this help message and exit |
|---|
| 41 | -c, --case_sensitive Case sensitive search |
|---|
| 42 | -e, --everywhere Search everywhere in text |
|---|
| 43 | -a, --all Match using all RDF predicates |
|---|
| 44 | -p PREDICATE, --predicate=PREDICATE |
|---|
| 45 | Match only named predicate |
|---|
| 46 | -d, --display_alias Display alias(es) of search result |
|---|
| 47 | -l, --display_collection |
|---|
| 48 | Display membership in collections |
|---|
| 49 | -w, --webbrowser Display result in default web browser |
|---|
| 50 | |
|---|
| 51 | >> python skosfind.py bly |
|---|
| 52 | |
|---|
| 53 | === Search KMM Classmaster (RDF triple store) === |
|---|
| 54 | Current server: http://195.67.120.22:8080/openrdf-sesame/repositories/KMM_ALL |
|---|
| 55 | Filter: skos:prefLabel |
|---|
| 56 | Match start of string: True |
|---|
| 57 | Case sensitive: False |
|---|
| 58 | Search for: bly |
|---|
| 59 | |
|---|
| 60 | -- Search results -- |
|---|
| 61 | |
|---|
| 62 | blygruva(-or) |
|---|
| 63 | http://www.geonames.org/ontology#S.MNPB |
|---|
| 64 | Predicate: skos:prefLabel@sv |
|---|
| 65 | |
|---|
| 66 | bly |
|---|
| 67 | http://classmaster.museum24.se/concepts/00000119cfa3fc218b3efd9a007f000000010001 |
|---|
| 68 | Predicate: skos:prefLabel@sv |
|---|
| 69 | |
|---|
| 70 | blyerts |
|---|
| 71 | http://classmaster.museum24.se/concepts/00000119cfa3fc218b3efd9b007f000000010001 |
|---|
| 72 | Predicate: skos:prefLabel@sv |
|---|
| 73 | |
|---|
| 74 | Blysigill |
|---|
| 75 | http://classmaster.museum24.se/concepts/00000119cfa3fa0cdf666f93007f000000010001 |
|---|
| 76 | Predicate: skos:prefLabel@sv |
|---|
| 77 | |
|---|
| 78 | Number of matches: 4 |
|---|
| 79 | |
|---|
| 80 | """ |
|---|
| 81 | import sys |
|---|
| 82 | import time |
|---|
| 83 | import urllib |
|---|
| 84 | import webbrowser |
|---|
| 85 | from optparse import OptionParser |
|---|
| 86 | from SPARQLWrapper import SPARQLWrapper, JSON |
|---|
| 87 | from rdflib import Literal |
|---|
| 88 | import re |
|---|
| 89 | |
|---|
# Repository id; holds the current Classmaster test data merged into one repo.
DEFAULT_REPO = "KMM_ALL"
# SPARQL endpoint of the default repository.
DEFAULT_SERVER = "http://195.67.120.22:8080/openrdf-sesame/repositories/%s" % (
    DEFAULT_REPO,)

# Workbench pages used when showing a result in the web browser.
WORKBENCH = "http://195.67.120.22:8080/openrdf-workbench/repository/"
# The resource URI and a closing ">" are appended by the caller.
EXPLORE_URL = "".join([WORKBENCH, "explore/resource.view?resource=<"])
# The repository id is appended by the caller.
VIEW_REPO_URL = "".join([WORKBENCH, "overview.view?id="])

# Namespaces referenced by the SPARQL queries in this module.
SKOS_CORE = "http://www.w3.org/2004/02/skos/core#"
OWL_NAMESPACE = "http://www.w3.org/2002/07/owl#"
RDF_SYNTAX = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"

# "?p" is a SPARQL variable, so it matches every predicate.
ANY_PREDICATE = "?p"
# Predicate searched when the user does not choose one.
DEFAULT_PREDICATE = "skos:prefLabel"
| 103 | |
|---|
def get_triples(search_term, match_from_start, predicate, case_sensitive,
                server=DEFAULT_SERVER):
    """Find triples (subject, predicate, object) matching search_term.

    search_term: regular expression matched against str(?o). It is inserted
        into the query verbatim, so it must already be valid inside a
        double-quoted SPARQL string (no unescaped double quotes).
    match_from_start: if true, anchor the match at the beginning of the
        string; otherwise match anywhere in it.
    predicate: predicate to match, e.g. "skos:prefLabel". Use ANY_PREDICATE
        to match every predicate.
    case_sensitive: True or False.
    server: SPARQL endpoint to query. Defaults to DEFAULT_SERVER, like the
        other query functions in this module.

    Returns a tuple (query, results): the query text that was sent and the
    converted JSON response.
    """
    # "^" anchors the regular expression at the start of the string.
    anchor = ""
    if match_from_start:
        anchor = "^"

    # Case sensitive matching is the Sesame and SPARQL default; the "i"
    # flag makes the regex case-insensitive.
    flags = ', "i"'
    if case_sensitive:
        flags = ""

    query = "".join([
        "PREFIX skos:<", SKOS_CORE, "> ",
        "SELECT ?s ?p ?o WHERE {?s ", predicate, " ?o. ",
        'FILTER(regex(str(?o),"', anchor, search_term, '"', flags, ")).}",
    ])

    sparql = SPARQLWrapper(server)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return query, results
| 124 | |
|---|
def get_sameAs(uri, server=DEFAULT_SERVER):
    '''For a given uri, find identical concepts using owl:sameAs.

    uri: the resource whose aliases are wanted. It is compared for string
        equality with the object of every owl:sameAs triple and is inserted
        into the query verbatim.
    server: SPARQL endpoint to query.

    Returns a tuple (query, results): the query text that was sent and the
    converted JSON response. The matching subjects are bound to ?s.
    '''
    query = "\n".join([
        "PREFIX skos:<" + SKOS_CORE + ">",
        "PREFIX owl:<" + OWL_NAMESPACE + ">",
        'SELECT DISTINCT ?s WHERE {?s owl:sameAs ?o. FILTER(str(?o) ="'
        + uri + '").}',
    ])

    sparql = SPARQLWrapper(server)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return query, results
| 138 | |
|---|
def get_inScheme(uri, server=DEFAULT_SERVER):
    '''List all SKOS concept schemes a given uri belongs to.

    uri: the resource whose schemes are wanted. It is compared for string
        equality with the subject of every skos:inScheme triple and is
        inserted into the query verbatim.
    server: SPARQL endpoint to query.

    Returns a tuple (query, results): the query text that was sent and the
    converted JSON response. The schemes are bound to ?o.
    '''
    query = "\n".join([
        "PREFIX skos:<" + SKOS_CORE + ">",
        "PREFIX owl:<" + OWL_NAMESPACE + ">",
        'SELECT DISTINCT ?o WHERE {?s skos:inScheme ?o. FILTER(str(?s) ="'
        + uri + '").}',
    ])

    sparql = SPARQLWrapper(server)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return query, results
| 152 | |
|---|
def _strip_prefix(string, prefix=SKOS_CORE):
    """Return string with a leading prefix removed.

    The string is returned unchanged when it does not start with prefix.
    Blindly cutting len(prefix) characters would mangle URIs from other
    namespaces, which do show up when searching with ANY_PREDICATE.
    """
    if string.startswith(prefix):
        return string[len(prefix):]
    return string
| 156 | |
|---|
def show_in_browser(uri):
    '''Display uri in the default web browser.

    The overview page of the default repository is opened first so that
    the right repository is selected; uri is then opened in the same
    browser window when possible.
    '''
    repository_overview = VIEW_REPO_URL + DEFAULT_REPO
    webbrowser.open(repository_overview)
    # Give the browser time to open the repository before showing the uri.
    time.sleep(3)
    webbrowser.open(uri, new=0)
| 166 | |
|---|
def number_of_matches(results):
    '''Return how many bindings the parsed JSON query output contains.'''
    bindings = results["results"]["bindings"]
    return len(bindings)
| 170 | |
|---|
def _printresults(results, predicate, search_sameAs=True, search_inScheme=True):
    '''Parse and print JSON output from an HTTP SPARQL query.

    results: parsed JSON query output; each entry of
        results["results"]["bindings"] is one match.
    predicate: the predicate that was searched. With ANY_PREDICATE the
        predicate of each match is read from the match itself.
    search_sameAs: if true, also look up and print identical concepts
        (using owl:sameAs) for every match.
    search_inScheme: if true, also look up and print membership in
        collections (using skos:inScheme) for every match.

    Every print call passes a single parenthesised argument, so the output
    is the same whether print is a statement or a function.
    '''
    num_matches = number_of_matches(results)
    if num_matches > 0:
        print("-- Search results --")
        print("")

    for result in results["results"]["bindings"]:
        subject = result["s"]["value"]
        label = result["o"]["value"]

        print(label.encode("utf-8"))
        print(subject)

        if predicate == ANY_PREDICATE:  # Special case
            predicate_value = _strip_prefix(result["p"]["value"])
        else:
            predicate_value = predicate

        # Display the language tag if there is one. Only a missing key is
        # expected here; any other error should not be silenced.
        try:
            language = result["o"]["xml:lang"]
        except KeyError:
            language = None
        if language is None:
            print("Predicate: " + predicate_value)
        else:
            print("Predicate: " + predicate_value + "@" + language)

        # Optionally display identical concepts using owl:sameAs
        if search_sameAs:
            _query, alternatives = get_sameAs(subject)
            for alt in alternatives["results"]["bindings"]:
                print("Alias: " + alt["s"]["value"])

        # Optionally display collection membership using skos:inScheme
        if search_inScheme:
            _query, schemes = get_inScheme(subject)
            for scheme in schemes["results"]["bindings"]:
                print("Collection: " + scheme["o"]["value"])

        print("")  # Blank line between matches

    print("Number of matches: " + str(num_matches))
    print("")
| 214 | |
|---|
def _get_last_result(results):
    """Return the subject of the last match in results.

    results: parsed JSON query output; the matches are read from
        results["results"]["bindings"].

    Returns the "value" of the last binding's "s" entry, or None when
    there are no bindings at all.
    """
    bindings = results["results"]["bindings"]
    if not bindings:
        return None
    return bindings[-1]["s"]["value"]
| 219 | |
|---|
def _printheader(label, match_from_start, predicate, case_sensitive):
    """Print a banner describing the search that is about to be run.

    label: the search term; its unicode escapes are turned back into
        Swedish letters for display.
    match_from_start, predicate, case_sensitive: the search settings,
        echoed so the user can see what is being searched.
    """
    if predicate == ANY_PREDICATE:
        predicate_line = "Match all predicates "
    else:
        predicate_line = "Filter: " + predicate

    banner = [
        "",
        "=== Search KMM Classmaster (RDF triple store) ===",
        "Current server: " + DEFAULT_SERVER,
        predicate_line,
        "Match start of string: " + str(match_from_start),
        "Case sensitive: " + str(case_sensitive),
        "Search for: " + unicode_escape_to_sw(label),
        "",
    ]
    for line in banner:
        print(line)
| 232 | |
|---|
def sw_to_unicode_escape(label):
    '''Replace Swedish letters in label with backslash-u escape sequences.

    Only the six letters listed below are handled; every other character
    is left untouched. The result contains a literal backslash followed by
    "u" and four hex digits for each replaced letter, which keeps the
    search term pure ASCII when it is put into the query for the HTTP call.

    Returns the converted label.
    '''
    replacements = (
        ("å", "\\u00E5"),
        ("ä", "\\u00E4"),
        ("ö", "\\u00F6"),
        ("Å", "\\u00C5"),
        ("Ä", "\\u00C4"),
        ("Ö", "\\u00D6"),
    )
    for letter, escape in replacements:
        label = label.replace(letter, escape)
    return label
| 243 | |
|---|
def unicode_escape_to_sw(label):
    '''Turn backslash-u escape sequences in label back into Swedish letters.

    This is the inverse of sw_to_unicode_escape and is used to display a
    search term in readable form. Only the six escapes listed below are
    handled; everything else is left untouched.

    Returns the converted label.
    '''
    replacements = (
        ("\\u00E5", "å"),
        ("\\u00E4", "ä"),
        ("\\u00F6", "ö"),
        ("\\u00C5", "Å"),
        ("\\u00C4", "Ä"),
        ("\\u00D6", "Ö"),
    )
    for escape, letter in replacements:
        label = label.replace(escape, letter)
    return label
| 254 | |
|---|
def _optparse_parse():
    """Parse the command line and return (options, args).

    Exactly one positional argument (the search term) is expected. With
    any other number of arguments the help text is printed and
    parser.error() reports the problem.
    """
    parser = OptionParser("usage: %prog [options] arg")

    # (flags, keyword settings) for every option, in the order they should
    # appear in the help text.
    option_table = [
        (("-c", "--case_sensitive"),
         dict(action="store_true", dest="case_sensitive",
              help="Case sensitive search")),
        (("-e", "--everywhere"),
         dict(action="store_true", dest="everywhere",
              help="Search everywhere in text")),
        (("-a", "--all"),
         dict(action="store_true", dest="search_all",
              help="Match using all RDF predicates")),
        (("-p", "--predicate"),
         dict(action="store", type="string", dest="predicate",
              help="Match only named predicate")),
        (("-d", "--display_alias"),
         dict(action="store_true", dest="display_alias",
              help="Display alias(es) of search result")),
        (("-l", "--display_collection"),
         dict(action="store_true", dest="display_collection",
              help="Display membership in collections")),
        (("-w", "--webbrowser"),
         dict(action="store_true", dest="webbrowser",
              help="Display result in default web browser")),
    ]
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    # All switches are off unless given on the command line.
    parser.set_defaults(search_all=False,
                        case_sensitive=False,
                        everywhere=False,
                        webbrowser=False,
                        display_alias=False,
                        display_collection=False)

    options, args = parser.parse_args()
    if len(args) == 1:
        return options, args

    parser.print_help()
    print("")
    parser.error("What do you want to search for?")
    return options, args
| 288 | |
|---|
def main():
    """Run one search as described by the command line and print the result.

    The predicate is chosen in this order: every predicate when --all is
    given, otherwise the predicate named by the user, otherwise
    DEFAULT_PREDICATE.
    """
    options, args = _optparse_parse()  # Command line arguments from user

    if options.search_all:
        options.predicate = ANY_PREDICATE
    elif not options.predicate:
        options.predicate = DEFAULT_PREDICATE
    # Otherwise the predicate named by the user is kept as it is.

    anchored = not options.everywhere  # Search from beginning of string?
    term = sw_to_unicode_escape(args[0])  # Fix some non-ascii characters
    search_settings = (term, anchored, options.predicate,
                       options.case_sensitive)

    # Search and display results
    _printheader(*search_settings)
    _query, results = get_triples(*search_settings)
    _printresults(results,
                  options.predicate,
                  options.display_alias,
                  options.display_collection)

    # Optionally display last selected subject in web browser
    if number_of_matches(results) > 0 and options.webbrowser:
        show_in_browser(EXPLORE_URL + _get_last_result(results) + ">")


if __name__ == "__main__":
    main()