# -*- coding: utf-8 -*-
"""
Copyright: 2008, KMM Knowledge Management in Museums (www.museum24.se)
KMM is owned by Lulea University of Technology (www.ltu.se)
Manager of KMM is Ann.Hagerfors@ltu.se

License: GPL
Author:  jonas.beckman@kamidev.net
        
        Simple search against RDF data on remote SPARQL endpoints. 
    
        SPARQL is a query language for RDF data stored as triples (subject,
        predicate, object). See SPARQL FAQ for more background:
        http://thefigtrees.net/lee/sw/sparql-faq
        
        Currently this script is set up to search SKOS data. Read more about 
        SKOS here: http://www.w3.org/TR/skos-primer/.
        
        Code for HTTP queries is built on top of sparql-wrapper 1.1:
        http://sparql-wrapper.sourceforge.net/
        
        XML/RDF processing uses rdflib 2.4.1.dev-r1438. Download here:
        http://rdflib.net/ (or use: easy_install -U "rdflib>=2.4,<=3.0a")
        
        Query results are returned as JSON by default and parsed with 
        Simplejson: http://pypi.python.org/pypi/simplejson 
        (or "easy_install simplejson")
        
Date:   2008-04-20  1.0 looks up KMM online data on current test server
        2008-04-26  1.1 better search arguments. Data still on test server
        2008-05-05  2.0 renamed to skosfind.py. More options, Swedish characters
        2008-05-11  2.1 simplified API using optparse library

Sample usage:

>>   python skosfind.py -h
Usage: skosfind.py [options] arg

Options:
  -h, --help            show this help message and exit
  -c, --case_sensitive  Case sensitive search
  -e, --everywhere      Search everywhere in text
  -a, --all             Match using all RDF predicates
  -p PREDICATE, --predicate=PREDICATE
                        Match only named predicate
  -d, --display_alias   Display alias(es) of search result
  -l, --display_collection
                        Display membership in collections
  -w, --webbrowser      Display result in default web browser

>>  python skosfind.py bly

=== Search KMM Classmaster (RDF triple store) ===
Current server: http://195.67.120.22:8080/openrdf-sesame/repositories/KMM_ALL
Filter: skos:prefLabel
Match start of string: True
Case sensitive: False
Search for: bly

-- Search results --

blygruva(-or)
http://www.geonames.org/ontology#S.MNPB
Predicate: skos:prefLabel@sv

bly
http://classmaster.museum24.se/concepts/00000119cfa3fc218b3efd9a007f000000010001
Predicate: skos:prefLabel@sv

blyerts
http://classmaster.museum24.se/concepts/00000119cfa3fc218b3efd9b007f000000010001
Predicate: skos:prefLabel@sv

Blysigill
http://classmaster.museum24.se/concepts/00000119cfa3fa0cdf666f93007f000000010001
Predicate: skos:prefLabel@sv

Number of matches: 4

"""
import sys
import time
import urllib
import webbrowser
from optparse import OptionParser
from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import Literal
import re

# Repository holding the current Classmaster test data, merged into one repo.
DEFAULT_REPO = "KMM_ALL"
# SPARQL endpoint that all queries are sent to unless a server is given.
DEFAULT_SERVER = "http://195.67.120.22:8080/openrdf-sesame/repositories/%s" % (
    DEFAULT_REPO,)

# Workbench pages used when a result is shown in the web browser.
WORKBENCH = "http://195.67.120.22:8080/openrdf-workbench/repository/"
# Resource page prefix; the caller appends the URI and a closing ">".
EXPLORE_URL = "%sexplore/resource.view?resource=<" % (WORKBENCH,)
# Repository overview page prefix; the caller appends the repository id.
VIEW_REPO_URL = "%soverview.view?id=" % (WORKBENCH,)

# Namespace URIs.
SKOS_CORE = "http://www.w3.org/2004/02/skos/core#"
OWL_NAMESPACE = "http://www.w3.org/2002/07/owl#"
RDF_SYNTAX = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
# SPARQL variable used in place of a named predicate to match any predicate.
ANY_PREDICATE = "?p"
# Predicate matched when the user names none.
DEFAULT_PREDICATE = "skos:prefLabel"

def get_triples(search_term,match_from_start, predicate,case_sensitive):
    """Query DEFAULT_SERVER for triples whose object matches search_term.

    search_term: text placed verbatim inside the SPARQL regex() pattern, so
        regex metacharacters and double quotes in it are not escaped.
    match_from_start: when true the pattern is anchored with "^" so only
        text at the beginning of the object string matches.
    predicate: predicate to match, e.g. "skos:prefLabel". Pass
        ANY_PREDICATE to match every predicate.
    case_sensitive: True or False. False adds the regex "i" flag.

    Returns a (query, results) tuple: the SPARQL text that was sent and the
    converted JSON response.
    """
    # "^" anchors the regex at the start of the string; without it the
    # pattern may match anywhere.
    anchor = ""
    if match_from_start:
        anchor = "^"
    # Case-sensitive matching is the Sesame and SPARQL default; the "i"
    # flag argument switches to case-insensitive matching.
    flags = ', "i"'
    if case_sensitive:
        flags = ""
    query = ('PREFIX skos:<%s> SELECT ?s ?p ?o  WHERE {?s %s ?o. '
             'FILTER(regex(str(?o),"%s%s"%s)).}'
             % (SKOS_CORE, predicate, anchor, search_term, flags))

    endpoint = SPARQLWrapper(DEFAULT_SERVER)
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    return query, endpoint.query().convert()

def get_sameAs(uri,server=DEFAULT_SERVER):
    '''For a given uri, find identical concepts using owl:sameAs

    uri: URI compared for exact string equality with the object of each
        owl:sameAs triple. It is inserted verbatim into the query.
    server: SPARQL endpoint to query.

    Returns a (query, results) tuple: the SPARQL text that was sent and the
    converted JSON response, whose bindings hold the matching subjects
    under "s".
    '''
    sparql = SPARQLWrapper(server)
    query = '''PREFIX skos:<'''+SKOS_CORE+'''>
    PREFIX owl:<'''+OWL_NAMESPACE+'''>
    SELECT DISTINCT ?s  WHERE {?s owl:sameAs ?o. FILTER(str(?o) ="'''+ uri +'''").}'''
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return query,results

def get_inScheme(uri,server=DEFAULT_SERVER):
    '''List all SKOS conceptschemes a given uri belongs to

    uri: URI compared for exact string equality with the subject of each
        skos:inScheme triple. It is inserted verbatim into the query.
    server: SPARQL endpoint to query.

    Returns a (query, results) tuple: the SPARQL text that was sent and the
    converted JSON response, whose bindings hold the schemes under "o".
    '''
    sparql = SPARQLWrapper(server)
    query = '''PREFIX skos:<'''+SKOS_CORE+'''>
    PREFIX owl:<'''+OWL_NAMESPACE+'''>
    SELECT DISTINCT ?o  WHERE {?s skos:inScheme ?o. FILTER(str(?s) ="'''+ uri +'''").}'''
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return query,results

def _strip_prefix(string,prefix = SKOS_CORE):
    '''Return string without its leading prefix.

    string: text to shorten, e.g. a full predicate URI.
    prefix: leading text to remove (the SKOS core namespace by default).

    A string that does not start with prefix is returned unchanged, so a
    URI from another namespace is never cut in the middle.
    '''
    if string.startswith(prefix):
        return string[len(prefix):]
    return string

def show_in_browser(uri):
    '''Open uri in the default web browser.

    The overview page of DEFAULT_REPO is opened first so that the right
    repository is selected, then uri is opened, in the same browser window
    if possible.
    '''
    repository_overview = VIEW_REPO_URL + DEFAULT_REPO
    webbrowser.open(repository_overview)
    # Give the browser time to open the repository before loading uri.
    time.sleep(3)
    webbrowser.open(uri, new=0)

def number_of_matches(results):
    '''Count the result rows in parsed SPARQL JSON output.

    results: mapping with the rows stored under results["results"]["bindings"].
    '''
    bindings = results["results"]["bindings"]
    return len(bindings)

def _printresults(results,predicate,search_sameAs=True,search_inScheme=True):
    '''Parse and print JSON output from HTTP SPARQL query
    
    Optionally search and display identical concepts (using owl:sameAs)
    Optionally search and display membership in collections (using skos:inScheme)
    '''
    num_matches = number_of_matches(results)
    if num_matches > 0:
        print "-- Search results --"
        print
    subject = None
    for result in results["results"]["bindings"]:
        subject =  result["s"]["value"]
        object =  result["o"]["value"]
        
        print object.encode("utf-8")
        print subject
        
        if predicate == ANY_PREDICATE: # Special case
            predicate_value = _strip_prefix( result["p"]["value"])
        else:
            predicate_value = predicate
        try : # Display language tag if there is one
            language = result["o"]["xml:lang"]
            print "Predicate: " + predicate_value + "@" + language
        except:
            print "Predicate: " + predicate_value

        # Optionally display identical concepts using owl:sameAs
        if search_sameAs:
            query,alternatives = get_sameAs(subject)
            for alt in alternatives["results"]["bindings"]:
                v = alt["s"]["value"]
                print "Alias: "+v
        # Optionally display collection membership úsing skos:inScheme
        if search_inScheme:
            query,schemes = get_inScheme(subject)
            for scheme in schemes["results"]["bindings"]:
                v = scheme["o"]["value"]
                print "Collection: "+v
        print
    print "Number of matches: " + str(num_matches)
    print

def _get_last_result(results):
    '''Return the subject of the last result row, or None without rows.

    results: parsed JSON with the rows under results["results"]["bindings"].
    '''
    bindings = results["results"]["bindings"]
    if not bindings:
        # Previously an empty result raised UnboundLocalError.
        return None
    return bindings[-1]["s"]["value"]

def _printheader(label,match_from_start,predicate,case_sensitive):
    print
    print "=== Search KMM Classmaster (RDF triple store) ==="
    print "Current server: " + DEFAULT_SERVER
    if predicate == ANY_PREDICATE:
        print "Match all predicates "
    else:
        print "Filter: "+predicate 
    print "Match start of string: " + str(match_from_start)
    print "Case sensitive: " + str(case_sensitive)
    print "Search for: "+unicode_escape_to_sw(label) #+label.encode("utf-8")
    print

def sw_to_unicode_escape(label):
    '''Replace Swedish letters in label with unicode escape text.

    Workaround used to prepare the search term for the HTTP call: each of
    the six letters below is replaced by a backslash, "u" and its four-digit
    code point. All other text is left untouched.
    '''
    escapes = (
        ("å", "\\u00E5"),
        ("ä", "\\u00E4"),
        ("ö", "\\u00F6"),
        ("Å", "\\u00C5"),
        ("Ä", "\\u00C4"),
        ("Ö", "\\u00D6"),
    )
    for letter, escaped in escapes:
        label = label.replace(letter, escaped)
    return label

def unicode_escape_to_sw(label):
    '''Turn unicode escape text in label back into Swedish letters.

    Reverses sw_to_unicode_escape so an escaped search term can be shown
    to the user. Only the six upper-case-hex escapes listed below are
    recognised; all other text is left untouched.
    '''
    letters = (
        ("\\u00E5", "å"),
        ("\\u00E4", "ä"),
        ("\\u00F6", "ö"),
        ("\\u00C5", "Å"),
        ("\\u00C4", "Ä"),
        ("\\u00D6", "Ö"),
    )
    for escaped, letter in letters:
        label = label.replace(escaped, letter)
    return label

def _optparse_parse():
    '''Parse the command line and return (options, args).

    Exactly one positional argument (the search term) is required. With
    any other count the help text is printed and parser.error() ends the
    program. All boolean options default to False; predicate has no
    default.
    '''
    parser = OptionParser("usage: %prog [options] arg")
    # Registered in this order because it is also the order of the help text.
    option_table = (
        (("-c", "--case_sensitive"),
         dict(action="store_true", dest="case_sensitive",
              help="Case sensitive search")),
        (("-e", "--everywhere"),
         dict(action="store_true", dest="everywhere",
              help="Search everywhere in text")),
        (("-a", "--all"),
         dict(action="store_true", dest="search_all",
              help="Match using all RDF predicates")),
        (("-p", "--predicate"),
         dict(action="store", type="string", dest="predicate",
              help="Match only named predicate")),
        (("-d", "--display_alias"),
         dict(action="store_true", dest="display_alias",
              help="Display alias(es) of search result")),
        (("-l", "--display_collection"),
         dict(action="store_true", dest="display_collection",
              help="Display membership in collections")),
        (("-w", "--webbrowser"),
         dict(action="store_true", dest="webbrowser",
              help="Display result in default web browser")),
    )
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    parser.set_defaults(search_all=False,
                        case_sensitive=False,
                        everywhere=False,
                        webbrowser=False,
                        display_alias=False,
                        display_collection=False)

    options, args = parser.parse_args()
    if len(args) != 1:
        parser.print_help()
        print
        parser.error("What do you want to search for?")
    return options,args

def main():
    '''Run one search from the command line and print the result.

    Parses the options, picks the predicate to match (every predicate with
    -a, otherwise the one named with -p, otherwise DEFAULT_PREDICATE),
    prints a header, runs the query and prints the matches. With -w the
    last match is also opened in the default web browser.
    '''
    options,args = _optparse_parse() # Parse command line arguments from user
    if options.search_all:
        options.predicate = ANY_PREDICATE
    elif not options.predicate:
        # No -p given; a user-supplied predicate is kept as it is.
        options.predicate = DEFAULT_PREDICATE
    match_from_start = not options.everywhere # Search from beginning of string?
    search_term = sw_to_unicode_escape(args[0]) # Fix some non-ascii characters

    # Search and display results
    _printheader(search_term,
                 match_from_start,
                 options.predicate,
                 options.case_sensitive)
    query, results = get_triples(search_term,
                                 match_from_start,
                                 options.predicate,
                                 options.case_sensitive)
    _printresults(results,
                  options.predicate,
                  options.display_alias,
                  options.display_collection)

    # Optionally display last selected subject in web browser. The "> 0"
    # belongs to the match count, not to the boolean option.
    if number_of_matches(results) > 0 and options.webbrowser:
        show_in_browser(EXPLORE_URL+_get_last_result(results)+">")

# Run the search only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

