Ticket #25: skosfind.py

File skosfind.py, 11.8 kB (added by http://jonasbeckman.myopenid.com/, 8 months ago)
Line 
1# -*- coding: utf-8 -*-
2"""
3Copyright: 2008, KMM Knowledge Management in Museums (www.museum24.se)
4KMM is owned by Lulea University of Technology (www.ltu.se)
5Manager of KMM is Ann.Hagerfors@ltu.se
6
7License: GPL
8Author:  jonas.beckman@kamidev.net
9       
10        Simple search against RDF data on remote SPARQL endpoints.
11   
12        SPARQL is a query language for RDF data stored as triples (subject,
13        predicate, object). See SPARQL FAQ for more background:
14        http://thefigtrees.net/lee/sw/sparql-faq
15       
16        Currently this script is set up to search SKOS data. Read more about
17        SKOS here: http://www.w3.org/TR/skos-primer/.
18       
19        Code for HTTP queries is built on top of sparql-wrapper 1.1:
20        http://sparql-wrapper.sourceforge.net/
21       
22        XML/RDF processing uses rdflib 2.4.1.dev-r1438. Download here:
23        http://rdflib.net/ (or use 'easy_install -U "rdflib>=2.4,<=3.0a"')
24       
25        Query results are returned as JSON by default and parsed with
26        Simplejson: http://pypi.python.org/pypi/simplejson
27        (or "easy_install simplejson")
28       
29Date:   2008-04-20  1.0 looks up KMM online data on current test server
30        2008-04-26  1.1 better search arguments. Data still on test server
31        2008-05-05  2.0 renamed to skosfind.py. More options, Swedish characters
32        2008-05-11  2.1 simplified API using optparse library
33
34Sample usage:
35
36>>   python skosfind.py -h
37Usage: skosfind.py [options] arg
38
39Options:
40  -h, --help            show this help message and exit
41  -c, --case_sensitive  Case sensitive search
42  -e, --everywhere      Search everywhere in text
43  -a, --all             Match using all RDF predicates
44  -p PREDICATE, --predicate=PREDICATE
45                        Match only named predicate
46  -d, --display_alias   Display alias(es) of search result
47  -l, --display_collection
48                        Display membership in collections
49  -w, --webbrowser      Display result in default web browser
50
51>>  python skosfind.py bly
52
53=== Search KMM Classmaster (RDF triple store) ===
54Current server: http://195.67.120.22:8080/openrdf-sesame/repositories/KMM_ALL
55Filter: skos:prefLabel
56Match start of string: True
57Case sensitive: False
58Search for: bly
59
60-- Search results --
61
62blygruva(-or)
63http://www.geonames.org/ontology#S.MNPB
64Predicate: skos:prefLabel@sv
65
66bly
67http://classmaster.museum24.se/concepts/00000119cfa3fc218b3efd9a007f000000010001
68Predicate: skos:prefLabel@sv
69
70blyerts
71http://classmaster.museum24.se/concepts/00000119cfa3fc218b3efd9b007f000000010001
72Predicate: skos:prefLabel@sv
73
74Blysigill
75http://classmaster.museum24.se/concepts/00000119cfa3fa0cdf666f93007f000000010001
76Predicate: skos:prefLabel@sv
77
78Number of matches: 4
79
80"""
81import sys
82import time
83import urllib
84import webbrowser
85from optparse import OptionParser
86from SPARQLWrapper import SPARQLWrapper, JSON
87from rdflib import Literal
88import re
89
# Repository holding the current Classmaster test data, merged into one repo.
DEFAULT_REPO = "KMM_ALL"
# SPARQL endpoint queried by default: the Sesame repositories URL followed
# by the repository name.
DEFAULT_SERVER = (
    "http://195.67.120.22:8080/openrdf-sesame/repositories/" + DEFAULT_REPO
)

# Sesame workbench pages opened when a result is shown in the web browser.
WORKBENCH = "http://195.67.120.22:8080/openrdf-workbench/repository/"
EXPLORE_URL = WORKBENCH + "explore/resource.view?resource=<"
VIEW_REPO_URL = WORKBENCH + "overview.view?id="

# Namespace URIs.
SKOS_CORE = "http://www.w3.org/2004/02/skos/core#"
OWL_NAMESPACE = "http://www.w3.org/2002/07/owl#"
RDF_SYNTAX = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"

# Predicate choices: a SPARQL variable matches every predicate, otherwise
# the preferred SKOS label is searched.
ANY_PREDICATE = "?p"
DEFAULT_PREDICATE = "skos:prefLabel"
103
def get_triples(search_term,match_from_start, predicate,case_sensitive):
    """Find triples (subject, predicate, object) matching a search string.

    A SPARQL SELECT with a regex FILTER on str(?o) is built, sent to
    DEFAULT_SERVER and requested back as JSON.

    search_term: regular expression text to look for in the object.
    match_from_start: if true the pattern is anchored with "^", so only
        text at the beginning of the object matches.
    predicate: predicate placed verbatim in the triple pattern, e.g.
        "skos:prefLabel"; pass ANY_PREDICATE to match every predicate.
    case_sensitive: if false the regex "i" flag is added.

    Returns a (query, results) tuple: the query text that was sent and the
    converted JSON response.
    """
    # The term ends up inside a double-quoted SPARQL string literal, so a
    # bare double quote would terminate the literal early and corrupt (or
    # alter) the query.  Escape every quote that is preceded by an even
    # number of backslashes; quotes that are already escaped, and the
    # backslash-u escapes produced elsewhere in this script, are untouched.
    safe_term = re.sub(r'(?<!\\)((?:\\\\)*)"', r'\1\\"', search_term)

    anchor = ""                 # Find anywhere in string
    if match_from_start:
        anchor = "^"            # Match start of string
    flags = ', "i"'             # Case-insensitive 'i' flag
    if case_sensitive:          # This is Sesame and SPARQL default
        flags = ""

    query = ('PREFIX skos:<%s> SELECT ?s ?p ?o  WHERE {?s %s ?o. '
             'FILTER(regex(str(?o),"%s%s"%s)).}'
             % (SKOS_CORE, predicate, anchor, safe_term, flags))

    sparql = SPARQLWrapper(DEFAULT_SERVER)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return query,results
124
def get_sameAs(uri,server=DEFAULT_SERVER):
    '''For a given uri, find identical concepts using owl:sameAs.

    uri: the concept URI; subjects whose owl:sameAs object equals this
        string are selected.
    server: SPARQL endpoint to query.

    Returns a (query, results) tuple: the query text that was sent and the
    converted JSON response, whose bindings carry the matching subjects
    under "s".
    '''
    # The query text (including its line breaks and indentation) is kept
    # exactly as it has always been sent to the server.
    query = ("PREFIX skos:<" + SKOS_CORE + ">\n"
             "    PREFIX owl:<" + OWL_NAMESPACE + ">\n"
             "    SELECT DISTINCT ?s  WHERE {?s owl:sameAs ?o. "
             'FILTER(str(?o) ="' + uri + '").}')

    sparql = SPARQLWrapper(server)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return query,results
138
def get_inScheme(uri,server=DEFAULT_SERVER):
    '''List all SKOS concept schemes a given uri belongs to.

    uri: the concept URI; objects of skos:inScheme triples whose subject
        equals this string are selected.
    server: SPARQL endpoint to query.

    Returns a (query, results) tuple: the query text that was sent and the
    converted JSON response, whose bindings carry the schemes under "o".
    '''
    # The query text (including its line breaks and indentation) is kept
    # exactly as it has always been sent to the server.
    query = ("PREFIX skos:<" + SKOS_CORE + ">\n"
             "    PREFIX owl:<" + OWL_NAMESPACE + ">\n"
             "    SELECT DISTINCT ?o  WHERE {?s skos:inScheme ?o. "
             'FILTER(str(?s) ="' + uri + '").}')

    sparql = SPARQLWrapper(server)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return query,results
152
def _strip_prefix(string,prefix = SKOS_CORE):
    '''Return string with a leading prefix removed.

    string: text to shorten, typically a full predicate URI.
    prefix: text to remove from the front; defaults to the SKOS namespace.

    The prefix is only removed when string really starts with it.  When
    every predicate is searched, results can carry predicates from other
    namespaces; cutting a fixed number of characters off those would leave
    a meaningless fragment, so they are returned unchanged.
    '''
    if string.startswith(prefix):
        return string[len(prefix):]
    return string
156
def show_in_browser(uri):
    '''Display uri in the default web browser.

    The overview page of the default repository is opened first, and the
    uri is then opened, in the same browser window when possible.
    '''
    repository_page = VIEW_REPO_URL + DEFAULT_REPO
    webbrowser.open(repository_page)
    # Give the browser a moment so the repository is opened before the uri.
    time.sleep(3)
    webbrowser.open(uri, new=0)
166
def number_of_matches(results):
    '''Count the bindings in a parsed SPARQL JSON result.'''
    bindings = results["results"]["bindings"]
    return len(bindings)
170
def _printresults(results,predicate,search_sameAs=True,search_inScheme=True):
    '''Print every match of a parsed SPARQL JSON result.

    results: parsed JSON response; each binding supplies "s" and "o" (and
        "p" when all predicates were searched).
    predicate: the predicate that was searched, or ANY_PREDICATE.
    search_sameAs: if true, look up and print identical concepts
        (owl:sameAs) for each match.
    search_inScheme: if true, look up and print collection membership
        (skos:inScheme) for each match.

    For each match the object text, the subject and the predicate are
    printed, followed by the optional lookups; the number of matches is
    printed last.
    '''
    # Single-argument parenthesised prints give the same output as the
    # print statement on the Python 2 interpreter this script targets.
    num_matches = number_of_matches(results)
    if num_matches > 0:
        print("-- Search results --")
        print("")
    for result in results["results"]["bindings"]:
        subject = result["s"]["value"]
        label = result["o"]["value"]

        print(label.encode("utf-8"))
        print(subject)

        if predicate == ANY_PREDICATE: # Special case
            predicate_value = _strip_prefix(result["p"]["value"])
        else:
            predicate_value = predicate
        # Display the language tag if there is one.  Looking the key up
        # explicitly avoids a catch-all except that would also hide
        # unrelated errors.
        language = result["o"].get("xml:lang")
        if language is None:
            print("Predicate: " + predicate_value)
        else:
            print("Predicate: " + predicate_value + "@" + language)

        # Optionally display identical concepts using owl:sameAs
        if search_sameAs:
            _query, alternatives = get_sameAs(subject)
            for alt in alternatives["results"]["bindings"]:
                print("Alias: " + alt["s"]["value"])
        # Optionally display collection membership using skos:inScheme
        if search_inScheme:
            _query, schemes = get_inScheme(subject)
            for scheme in schemes["results"]["bindings"]:
                print("Collection: " + scheme["o"]["value"])
        print("")
    print("Number of matches: " + str(num_matches))
    print("")
214
def _get_last_result(results):
    '''Return the subject of the last binding in a parsed SPARQL result.

    results: parsed JSON response with a list under
        results["results"]["bindings"].

    Returns the "s" value of the final binding, or None when there are no
    bindings at all.
    '''
    bindings = results["results"]["bindings"]
    if not bindings:
        return None
    return bindings[-1]["s"]["value"]
219
def _printheader(label,match_from_start,predicate,case_sensitive):
    '''Print the banner describing the search that is about to run.

    Shows the server, the predicate filter (or that all predicates are
    matched), the two matching options and the search text with its
    unicode escapes turned back into Swedish letters.
    '''
    # Single-argument parenthesised prints give the same output as the
    # print statement on the Python 2 interpreter this script targets.
    print("")
    print("=== Search KMM Classmaster (RDF triple store) ===")
    print("Current server: " + DEFAULT_SERVER)
    if predicate != ANY_PREDICATE:
        print("Filter: "+predicate)
    else:
        print("Match all predicates ")
    print("Match start of string: " + str(match_from_start))
    print("Case sensitive: " + str(case_sensitive))
    readable_label = unicode_escape_to_sw(label)
    print("Search for: "+readable_label)
    print("")
232
def sw_to_unicode_escape(label):
    '''Replace Swedish letters in label with unicode escape text.

    Each of the six letters (lower and upper case a-ring, a-umlaut and
    o-umlaut) is replaced by the six characters of its escape: a backslash,
    the letter u and four hex digits.  This is a workaround so the search
    term can be sent in the HTTP query; all other characters are returned
    unchanged.
    '''
    # The letters are written directly; the file's utf-8 coding line makes
    # them match UTF-8 input.
    replacements = (
        ("å", "\\u00E5"),
        ("ä", "\\u00E4"),
        ("ö", "\\u00F6"),
        ("Å", "\\u00C5"),
        ("Ä", "\\u00C4"),
        ("Ö", "\\u00D6"),
    )
    for letter, escape in replacements:
        label = label.replace(letter, escape)
    return label
243
def unicode_escape_to_sw(label):
    '''Turn unicode escape text in label back into Swedish letters.

    This undoes sw_to_unicode_escape: each six-character escape (a
    backslash, the letter u and four hex digits) for lower and upper case
    a-ring, a-umlaut and o-umlaut is replaced by the letter itself, so the
    search term can be shown in readable form.  All other characters are
    returned unchanged.
    '''
    # The letters are written directly; the file's utf-8 coding line makes
    # them come out as UTF-8 text.
    replacements = (
        ("\\u00E5", "å"),
        ("\\u00E4", "ä"),
        ("\\u00F6", "ö"),
        ("\\u00C5", "Å"),
        ("\\u00C4", "Ä"),
        ("\\u00D6", "Ö"),
    )
    for escape, letter in replacements:
        label = label.replace(escape, letter)
    return label
254
def _optparse_parse():
    '''Parse the command line and return (options, args).

    All boolean switches default to False.  Exactly one positional
    argument, the search text, is required; otherwise the help text is
    printed and the problem is reported through the parser's error method.
    '''
    cli = OptionParser("usage: %prog [options] arg")

    # Options are registered in the order they appear in the help text.
    cli.add_option("-c", "--case_sensitive", dest="case_sensitive",
                   action="store_true", help="Case sensitive search")
    cli.add_option("-e", "--everywhere", dest="everywhere",
                   action="store_true", help="Search everywhere in text")
    cli.add_option("-a", "--all", dest="search_all",
                   action="store_true",
                   help="Match using all RDF predicates")
    cli.add_option("-p", "--predicate", dest="predicate",
                   action="store", type="string",
                   help="Match only named predicate")
    cli.add_option("-d", "--display_alias", dest="display_alias",
                   action="store_true",
                   help="Display alias(es) of search result")
    cli.add_option("-l", "--display_collection", dest="display_collection",
                   action="store_true",
                   help="Display membership in collections")
    cli.add_option("-w", "--webbrowser", dest="webbrowser",
                   action="store_true",
                   help="Display result in default web browser")

    cli.set_defaults(search_all=False,
                     case_sensitive=False,
                     everywhere=False,
                     webbrowser=False,
                     display_alias=False,
                     display_collection=False)

    options, args = cli.parse_args()
    if len(args) == 1:
        return options,args

    # Wrong number of search terms: show the help, then report the error.
    cli.print_help()
    print("")
    cli.error("What do you want to search for?")
    return options,args
288
def main():
    '''Run one search from the command line and print what was found.'''
    options, args = _optparse_parse()

    # Choose the predicate: "all" wins, then a predicate named by the
    # user, and the default predicate when neither was given.
    if options.search_all:
        options.predicate = ANY_PREDICATE
    elif not options.predicate:
        options.predicate = DEFAULT_PREDICATE

    # Unless the user asked to search everywhere, match from the start.
    match_from_start = not options.everywhere
    # Fix some non-ascii characters in the search text.
    search_term = sw_to_unicode_escape(args[0])

    # Search and display results
    _printheader(search_term, match_from_start,
                 options.predicate, options.case_sensitive)
    _query, results = get_triples(search_term, match_from_start,
                                  options.predicate, options.case_sensitive)
    _printresults(results, options.predicate,
                  options.display_alias, options.display_collection)

    # Optionally display the last selected subject in the web browser
    if number_of_matches(results) > 0 and options.webbrowser:
        last_subject = _get_last_result(results)
        show_in_browser(EXPLORE_URL + last_subject + ">")
317
# Run the search only when this file is executed as a script.
if __name__ == "__main__":
    main()