Ticket #25: skosfind.py

File skosfind.py, 11.8 KB (added by http://jonasbeckman.myopenid.com/, 4 years ago)
Line 
1# -*- coding: utf-8 -*-
2"""
3Copyright: 2008, KMM Knowledge Management in Museums (www.museum24.se)
4KMM is owned by Lulea University of Technology (www.ltu.se)
5Manager of KMM is Ann.Hagerfors@ltu.se
6
7License: GPL
8Author:  jonas.beckman@kamidev.net
9       
10        Simple search against RDF data on remote SPARQL endpoints.
11   
12        SPARQL is a query language for RDF data stored as triples (subject,
13        predicate, object). See SPARQL FAQ for more background:
14        http://thefigtrees.net/lee/sw/sparql-faq
15       
16        Currently this script is setup to search SKOS data. Read more about
17        SKOS here: http://www.w3.org/TR/skos-primer/.
18       
19        Code for HTTP queries is built on top of sparql-wrapper 1.1:
20        http://sparql-wrapper.sourceforge.net/
21       
22        XML/RDF processing uses rdflib 2.4.1.dev-r1438. Download here:
23        http://rdflib.net/ (or use: easy_install -U "rdflib>=2.4,<=3.0a")
24       
25        Query results are returned as JSON by default and parsed with
26        Simplejson: http://pypi.python.org/pypi/simplejson
27        (or "easy_install simplejson")
28       
29Date:   2008-04-20  1.0 looks up KMM online data on current test server
30        2008-04-26  1.1 better search arguments. Data still on test server
31        2008-05-05  2.0 renamed to skosfind.py. More options, Swedish characters
32        2008-05-11  2.1 simplified API using optparse library
33
34Sample usage:
35
36>>   python skosfind.py -h
37Usage: skosfind.py [options] arg
38
39Options:
40  -h, --help            show this help message and exit
41  -c, --case_sensitive  Case sensitive search
42  -e, --everywhere      Search everywhere in text
43  -a, --all             Match using all RDF predicates
44  -p PREDICATE, --predicate=PREDICATE
45                        Match only named predicate
46  -d, --display_alias   Display alias(es) of search result
47  -l, --display_collection
48                        Display membership in collections
49  -w, --webbrowser      Display result in default web browser
50
51>>  python skosfind.py bly
52
53=== Search KMM Classmaster (RDF triple store) ===
54Current server: http://195.67.120.22:8080/openrdf-sesame/repositories/KMM_ALL
55Filter: skos:prefLabel
56Match start of string: True
57Case sensitive: False
58Search for: bly
59
60-- Search results --
61
62blygruva(-or)
63http://www.geonames.org/ontology#S.MNPB
64Predicate: skos:prefLabel@sv
65
66bly
67http://classmaster.museum24.se/concepts/00000119cfa3fc218b3efd9a007f000000010001
68Predicate: skos:prefLabel@sv
69
70blyerts
71http://classmaster.museum24.se/concepts/00000119cfa3fc218b3efd9b007f000000010001
72Predicate: skos:prefLabel@sv
73
74Blysigill
75http://classmaster.museum24.se/concepts/00000119cfa3fa0cdf666f93007f000000010001
76Predicate: skos:prefLabel@sv
77
78Number of matches: 4
79
80"""
81import sys
82import time
83import urllib
84import webbrowser
85from optparse import OptionParser
86from SPARQLWrapper import SPARQLWrapper, JSON
87from rdflib import Literal
88import re
89
# Repository holding the current Classmaster test data, merged into one repo.
DEFAULT_REPO = "KMM_ALL"
# SPARQL endpoint of that repository.
DEFAULT_SERVER = (
    "http://195.67.120.22:8080/openrdf-sesame/repositories/" + DEFAULT_REPO
)

# Workbench pages opened by show_in_browser() / main().
WORKBENCH = "http://195.67.120.22:8080/openrdf-workbench/repository/"
EXPLORE_URL = "%sexplore/resource.view?resource=<" % WORKBENCH
VIEW_REPO_URL = "%soverview.view?id=" % WORKBENCH

# Namespace URIs used when building queries and trimming predicate names.
SKOS_CORE = "http://www.w3.org/2004/02/skos/core#"
OWL_NAMESPACE = "http://www.w3.org/2002/07/owl#"
RDF_SYNTAX = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"

# "?p" is a SPARQL variable, so using it as predicate matches every predicate.
ANY_PREDICATE = "?p"
DEFAULT_PREDICATE = "skos:prefLabel"
103
def get_triples(search_term,match_from_start, predicate,case_sensitive):
    """Find triples (subject, predicate, object) whose object matches search_term.

    The search term is used as a regular expression against the string
    value of the object, on the endpoint at DEFAULT_SERVER.

    search_term: regex text to look for.
    match_from_start: if true, anchor the match at the beginning of the
        object string; otherwise match anywhere in it.
    predicate: predicate to match, e.g. "skos:prefLabel". Use ANY_PREDICATE
        to match every predicate.
    case_sensitive: True or False.

    Returns a tuple (query, results): the SPARQL query text that was sent
    and the response converted from JSON.
    """
    # An unescaped double quote would terminate the SPARQL string literal
    # early and make the whole query invalid. Escape only quotes that are
    # not already escaped (i.e. preceded by an even number of backslashes);
    # other backslashes are left alone so escapes already present in the
    # term (such as those added by sw_to_unicode_escape) keep working.
    safe_term = re.sub(r'(?<!\\)((?:\\\\)*)"', r'\1\\"', search_term)

    regex_start = '''FILTER(regex(str(?o),"'''      # Find anywhere in string
    if match_from_start:
        regex_start = regex_start + "^"             # Match start of string
    if case_sensitive: # This is Sesame and SPARQL default
        regex_end = '''")).}'''
    else:
        regex_end = '''", "i")).}''' # case-insensitive 'i' flag

    query = ("PREFIX skos:<" + SKOS_CORE + "> SELECT ?s ?p ?o  WHERE {?s "
             + predicate + " ?o. " + regex_start + safe_term + regex_end)

    sparql = SPARQLWrapper(DEFAULT_SERVER)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return query,results
124
def get_sameAs(uri,server=DEFAULT_SERVER):
    '''For a given uri, find identical concepts using owl:sameAs.

    uri: URI to look up; matched against the string value of the object
        of owl:sameAs triples.
    server: SPARQL endpoint to query.

    Returns a tuple (query, results): the SPARQL query text that was sent
    and the response converted from JSON. Each binding holds the matching
    subject under "s".
    '''
    query = '''PREFIX skos:<'''+SKOS_CORE+'''>
    PREFIX owl:<'''+OWL_NAMESPACE+'''>
    SELECT DISTINCT ?s  WHERE {?s owl:sameAs ?o. FILTER(str(?o) ="'''+ uri +'''").}'''

    sparql = SPARQLWrapper(server)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return query,results
138
def get_inScheme(uri,server=DEFAULT_SERVER):
    '''List all SKOS concept schemes a given uri belongs to.

    uri: URI to look up; matched against the string value of the subject
        of skos:inScheme triples.
    server: SPARQL endpoint to query.

    Returns a tuple (query, results): the SPARQL query text that was sent
    and the response converted from JSON. Each binding holds a scheme
    under "o".
    '''
    query = '''PREFIX skos:<'''+SKOS_CORE+'''>
    PREFIX owl:<'''+OWL_NAMESPACE+'''>
    SELECT DISTINCT ?o  WHERE {?s skos:inScheme ?o. FILTER(str(?s) ="'''+ uri +'''").}'''

    sparql = SPARQLWrapper(server)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return query,results
152
def _strip_prefix(string,prefix = SKOS_CORE):
    '''Return string with a leading prefix removed.

    The prefix is only removed when string really starts with it; any
    other string (for example a predicate URI from a different namespace)
    is returned unchanged instead of losing its first len(prefix)
    characters.
    '''
    if string.startswith(prefix):
        return string[len(prefix):]
    return string
156
def show_in_browser(uri):
    '''Display uri in the default web browser.

    The overview page of DEFAULT_REPO is opened first so that the right
    repository is selected; uri is then opened, reusing the same browser
    window when possible.
    '''
    repository_page = VIEW_REPO_URL + DEFAULT_REPO
    webbrowser.open(repository_page)
    # Pause so the repository page is opened before the result page.
    time.sleep(3)
    webbrowser.open(uri, new=0)  # new=0: same window if possible
166
def number_of_matches(results):
    '''Return how many bindings (matches) the parsed JSON results contain.'''
    bindings = results["results"]["bindings"]
    return len(bindings)
170
def _printresults(results,predicate,search_sameAs=True,search_inScheme=True):
    '''Parse and print JSON output from HTTP SPARQL query.

    results: parsed JSON response as returned by get_triples().
    predicate: predicate that was searched for. With ANY_PREDICATE the
        predicate found in each result is shown instead.
    search_sameAs: if true, look up and display identical concepts
        (using owl:sameAs) for every result.
    search_inScheme: if true, look up and display membership in
        collections (using skos:inScheme) for every result.

    Note: the two optional lookups issue one extra query per result.
    '''
    num_matches = number_of_matches(results)
    if num_matches > 0:
        print("-- Search results --")
        print("")
    for result in results["results"]["bindings"]:
        subject = result["s"]["value"]
        label = result["o"]["value"]

        # Labels may contain non-ASCII characters; encode before printing.
        print(label.encode("utf-8"))
        print(subject)

        if predicate == ANY_PREDICATE: # Special case
            predicate_value = _strip_prefix(result["p"]["value"])
        else:
            predicate_value = predicate

        # Display language tag if there is one. The key is optional, so
        # look it up explicitly rather than swallowing every exception.
        language = result["o"].get("xml:lang")
        if language is None:
            print("Predicate: " + predicate_value)
        else:
            print("Predicate: " + predicate_value + "@" + language)

        # Optionally display identical concepts using owl:sameAs
        if search_sameAs:
            query,alternatives = get_sameAs(subject)
            for alt in alternatives["results"]["bindings"]:
                print("Alias: " + alt["s"]["value"])
        # Optionally display collection membership using skos:inScheme
        if search_inScheme:
            query,schemes = get_inScheme(subject)
            for scheme in schemes["results"]["bindings"]:
                print("Collection: " + scheme["o"]["value"])
        print("")
    print("Number of matches: " + str(num_matches))
    print("")
214
def _get_last_result(results):
    '''Return the subject URI of the last binding in results.

    results: parsed JSON response as returned by get_triples().

    Returns None when there are no bindings.
    '''
    bindings = results["results"]["bindings"]
    if not bindings:
        return None
    return bindings[-1]["s"]["value"]
219
def _printheader(label,match_from_start,predicate,case_sensitive):
    '''Print a banner summarising the search that is about to run.

    label: search term; escapes for Swedish letters are converted back
        with unicode_escape_to_sw() before display.
    match_from_start, predicate, case_sensitive: the search settings to
        report.
    '''
    print("")
    print("=== Search KMM Classmaster (RDF triple store) ===")
    print("Current server: %s" % DEFAULT_SERVER)
    searching_everything = (predicate == ANY_PREDICATE)
    print("Match all predicates " if searching_everything
          else "Filter: " + predicate)
    print("Match start of string: %s" % (match_from_start,))
    print("Case sensitive: %s" % (case_sensitive,))
    print("Search for: " + unicode_escape_to_sw(label))
    print("")
232
def sw_to_unicode_escape(label):
    '''Replace Swedish letters in label with backslash-u escape sequences.

    Workaround used on the search term before it is sent in the HTTP
    query. Only the six letters listed below are converted; everything
    else is returned untouched.
    '''
    replacements = (
        ("å", "\\u00E5"),
        ("ä", "\\u00E4"),
        ("ö", "\\u00F6"),
        ("Å", "\\u00C5"),
        ("Ä", "\\u00C4"),
        ("Ö", "\\u00D6"),
    )
    for letter, escape in replacements:
        label = label.replace(letter, escape)
    return label
242
def unicode_escape_to_sw(label):
    '''Replace backslash-u escape sequences in label with Swedish letters.

    Inverse of the escaping applied to the search term, used so the term
    can be displayed readably. Only the six escapes listed below are
    converted; everything else is returned untouched.
    '''
    replacements = (
        ("\\u00E5", "å"),
        ("\\u00E4", "ä"),
        ("\\u00F6", "ö"),
        ("\\u00C5", "Å"),
        ("\\u00C4", "Ä"),
        ("\\u00D6", "Ö"),
    )
    for escape, letter in replacements:
        label = label.replace(escape, letter)
    return label
252
def _optparse_parse():
    '''Parse the command line and return (options, args).

    Exactly one positional argument (the search term) is required. When
    it is missing, or more than one is given, the help text is printed
    and parser.error() reports the problem.
    '''
    parser = OptionParser("usage: %prog [options] arg")

    parser.add_option("-c", "--case_sensitive",
                      dest="case_sensitive", action="store_true",
                      help="Case sensitive search" )
    parser.add_option("-e", "--everywhere",
                      dest="everywhere", action="store_true",
                      help="Search everywhere in text" )
    parser.add_option("-a", "--all",
                      dest="search_all", action="store_true",
                      help="Match using all RDF predicates")
    parser.add_option("-p", "--predicate",
                      dest="predicate", action="store", type="string",
                      help="Match only named predicate")
    parser.add_option("-d", "--display_alias",
                      dest="display_alias", action="store_true",
                      help="Display alias(es) of search result" )
    parser.add_option("-l", "--display_collection",
                      dest="display_collection", action="store_true",
                      help="Display membership in collections" )
    parser.add_option("-w", "--webbrowser",
                      dest="webbrowser", action="store_true",
                      help="Display result in default web browser" )

    # Every flag is off unless given on the command line.
    parser.set_defaults(search_all=False,
                        case_sensitive=False,
                        everywhere=False,
                        webbrowser=False,
                        display_alias=False,
                        display_collection=False)

    options, args = parser.parse_args()
    if len(args) != 1:
        parser.print_help()
        print("")
        parser.error("What do you want to search for?")
    return options,args
286
def main():
    '''Run a search from the command line and print the results.

    Parses the command line, prints a header describing the search,
    queries the server and prints every match. With --webbrowser the last
    match is also opened in the default web browser.
    '''
    options,args = _optparse_parse() # Parse command line arguments from user

    # --all overrides --predicate; with neither, use the default predicate.
    if options.search_all:
        options.predicate = ANY_PREDICATE
    elif not options.predicate:
        options.predicate = DEFAULT_PREDICATE #"skos:prefLabel"

    match_from_start = not options.everywhere #Search from beginning of string?
    search_term = sw_to_unicode_escape(args[0]) #Fix some non-ascii characters

    # Search and display results
    _printheader(search_term,
                 match_from_start,
                 options.predicate,
                 options.case_sensitive)
    query, results = get_triples(search_term,
                                 match_from_start,
                                 options.predicate,
                                 options.case_sensitive)
    _printresults(results,
                  options.predicate,
                  options.display_alias,
                  options.display_collection)

    # Optionally display last selected subject in web browser
    if number_of_matches(results) > 0 and options.webbrowser:
        show_in_browser(EXPLORE_URL+_get_last_result(results)+">")
315
# Run the search only when executed as a script, not when imported.
if __name__ == "__main__":
    main()