OntoSim 2.1

fr.inrialpes.exmo.ontosim.string
Class JWNLDistances

java.lang.Object
  extended by fr.inrialpes.exmo.ontosim.string.JWNLDistances

public class JWNLDistances
extends Object

Compute a string distance using the JWNL API (WordNet API) and sometimes Lucene.

Version:
$Id: JWNLDistances.java 97 2010-08-31 15:06:37Z euzenat $
Author:
Jerome Pierson, David Loup, Petko Valtchev, Jerome Euzenat

Field Summary
static double ADJ_WEIGHT
           
private  Hashtable adjectives1
           
private  Hashtable adjectives2
           
(package private)  double[][] adjectivesMasks
           
(package private)  double[][] adjectivesResults
           
protected  WeakHashMap cache
          Provides the opportunity to cache pretreatments in measures which require them. Using this requires calling initPreCache() before using the cache and cleanPreCache() afterwards.
private static net.didion.jwnl.dictionary.Dictionary dictionary
           
private static double MINIMUM_DISTANCE
           
static double NOUN_WEIGHT
           
private  Hashtable nouns1
           
private  Hashtable nouns2
           
(package private)  double[][] nounsMasks
           
(package private)  double[][] nounsResults
           
private static Set<String> stopWords
           
static double VERB_WEIGHT
           
private  Hashtable verbs1
           
private  Hashtable verbs2
           
(package private)  double[][] verbsMasks
           
(package private)  double[][] verbsResults
           
 
Constructor Summary
JWNLDistances()
           
 
Method Summary
 double basicGlossOverlap(String s1, String s2)
          Compute the overlap between all glosses of two strings
 double basicSynonymDistance(String s1, String s2)
          Compute a basic distance between 2 strings using WordNet synonyms.
 double basicSynonymySimilarity(String s1, String s2)
          Evaluate if two terms can be synonyms
private  double bestMatch(double[][] matrix)
           
 void cleanPreCache()
           
 double compareComponentNames(String s1, String s2)
           
protected  Collection<String> computeGlossValue(String s)
          Cache method for glosses
 double computeSimilarity(String s1, String s2)
          This is an elaborate similarity based on WordNet. It is assumed to assess the similarity based on a decomposition and parsing of the strings.
protected  Set<net.didion.jwnl.data.Synset> computeSynsets(String s)
          Cache method for synsets
 double computeTokenSimilarity(net.didion.jwnl.data.IndexWord index1, net.didion.jwnl.data.IndexWord index2)
           
 double cosynonymySimilarity(String s1, String s2)
          Compute the proportion of common synsets between two words
 void display(net.didion.jwnl.data.Synset syn)
           
 void displayMatrix(double[][] matrix)
           
 void fillWithOnes(double[][] matrix)
           
 double findMatchForAdj(net.didion.jwnl.data.IndexWord index1, net.didion.jwnl.data.IndexWord index2)
           
 double[][] getAdjectivesResults()
           
(package private)  Set<net.didion.jwnl.data.Synset> getAllSenses(String term)
          Retrieve all WordNet senses of a term
 int getCommonConcepts(net.didion.jwnl.data.list.PointerTargetNodeList list1, net.didion.jwnl.data.list.PointerTargetNodeList list2)
           
protected  String getGlossForLabel(String s)
           
protected  String getGlossForLabel1(String s)
          Fetches all the glosses from WordNet for the given term and concatenates them (without quotations).
 double[][] getNounsResults()
           
 int getNumberOfOccurences(String token, Hashtable nouns, Hashtable adj, Hashtable verbs)
           
 int getNumberOfOccurences(String token, int n)
           
 double[][] getVerbsResults()
           
 void Initialize()
          Initialize the JWNL API.
 void Initialize(String wordnetdir, String wordnetversion)
           
 void initPreCache()
           
 Set<String> loadStopWordsFromFile(String filename)
          Reads a file containing one stopword per line. Returns these stop words as a set of strings. Sets the default stopWords to this list.
 void lookUpWord(String word, Hashtable<String,net.didion.jwnl.data.IndexWord> nouns, Hashtable<String,net.didion.jwnl.data.IndexWord> adjectives, Hashtable<String,net.didion.jwnl.data.IndexWord> verbs)
           
protected  String splitStringForWordNet(String s)
          Retains only strings made of lowercase/uppercase characters. Suppresses numbers. Splits strings where they contain a LowercaseUppercase boundary or any of "/" ":" "_" "\" "+" "." "*" "&", but not "-" or "@", which are taken into account by Lucene.
protected  Set<String> tokenizeGloss(String s)
          Takes a gloss-like string (text) and returns it tokenized.
 double wuPalmerSimilarity(String s1, String s2)
          Compute the Wu-Palmer similarity defined by score = 2*depth(lcs(s1,s2)) / (depth(s1) + depth(s2))
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

NOUN_WEIGHT

public static final double NOUN_WEIGHT
See Also:
Constant Field Values

ADJ_WEIGHT

public static final double ADJ_WEIGHT
See Also:
Constant Field Values

VERB_WEIGHT

public static final double VERB_WEIGHT
See Also:
Constant Field Values

MINIMUM_DISTANCE

private static final double MINIMUM_DISTANCE
See Also:
Constant Field Values

dictionary

private static net.didion.jwnl.dictionary.Dictionary dictionary

stopWords

private static Set<String> stopWords

nounsResults

double[][] nounsResults

verbsResults

double[][] verbsResults

adjectivesResults

double[][] adjectivesResults

nounsMasks

double[][] nounsMasks

verbsMasks

double[][] verbsMasks

adjectivesMasks

double[][] adjectivesMasks

nouns1

private Hashtable nouns1

adjectives1

private Hashtable adjectives1

verbs1

private Hashtable verbs1

nouns2

private Hashtable nouns2

adjectives2

private Hashtable adjectives2

verbs2

private Hashtable verbs2

cache

protected WeakHashMap cache
Provides the opportunity to cache pretreatments in measures which require them. Using this requires calling initPreCache() before using the cache and cleanPreCache() afterwards. This can only improve performance.

Constructor Detail

JWNLDistances

public JWNLDistances()
Method Detail

Initialize

public void Initialize()
                throws OntoSimException
Initialize the JWNL API. Must be done once before computing any distance. Requires configuring the file_properties.xml file located in the current directory.

Throws:
OntoSimException

Initialize

public void Initialize(String wordnetdir,
                       String wordnetversion)
                throws OntoSimException
Throws:
OntoSimException

initPreCache

public void initPreCache()

cleanPreCache

public void cleanPreCache()

loadStopWordsFromFile

public Set<String> loadStopWordsFromFile(String filename)
                                  throws IOException,
                                         FileNotFoundException
Reads a file containing one stopword per line. Returns these stop words as a set of strings. Sets the default stopWords to this list.

Throws:
IOException
FileNotFoundException

basicSynonymDistance

public double basicSynonymDistance(String s1,
                                   String s2)
Compute a basic distance between 2 strings using WordNet synonyms.

Parameters:
s1 -
s2 -
Returns:
Distance between s1 & s2 (return 1 if s2 is a synonym of s1, else return a BasicStringDistance between s1 & s2)

getAllSenses

Set<net.didion.jwnl.data.Synset> getAllSenses(String term)
                                        throws OntoSimException
Retrieve all WordNet senses of a term

Parameters:
term -
Returns:
the set of senses of term
Throws:
OntoSimException

computeSynsets

protected Set<net.didion.jwnl.data.Synset> computeSynsets(String s)
                                                   throws OntoSimException
Cache method for synsets

Throws:
OntoSimException

cosynonymySimilarity

public double cosynonymySimilarity(String s1,
                                   String s2)
                            throws OntoSimException
Compute the proportion of common synsets between two words

Parameters:
s1 - a String
s2 - a String
Returns:
the proportion of common synonyms shared by both terms
Throws:
OntoSimException

basicSynonymySimilarity

public double basicSynonymySimilarity(String s1,
                                      String s2)
                               throws OntoSimException
Evaluate if two terms can be synonyms

Parameters:
s1 - a String
s2 - a String
Returns:
1 if the strings are equal or s2 is a synonym of s1. Note that this is asymmetric: it is assumed that WordNet is symmetric.
Throws:
OntoSimException

basicGlossOverlap

public double basicGlossOverlap(String s1,
                                String s2)
                         throws OntoSimException
Compute the overlap between all glosses of two strings

Parameters:
s1 - a String
s2 - a String
Returns:
a measure of the overlap of their glosses in WordNet, based on the following treatments: - take the gloss for all senses and add the term name; - suppress quotations ('...'); - suppress empty words (or, and, the, a, an, for, of, etc.); [- suppress technical vocabulary, e.g., 'term';] [- suppress empty phrases, e.g., 'usually including';] - keep categories, e.g., law; - stem words. The results are sets of words (not bags, so there is no repetition), which are compared with: |g(t1) \cap g(t2)| / |g(t1) \cup g(t2)|
Throws:
OntoSimException

computeGlossValue

protected Collection<String> computeGlossValue(String s)
                                        throws OntoSimException
Cache method for glosses

Throws:
OntoSimException

tokenizeGloss

protected Set<String> tokenizeGloss(String s)
                             throws IOException
Takes a gloss-like string (text) and returns it tokenized, with: - stopwords - lower case - Porter stemmer

Throws:
IOException

getGlossForLabel

protected String getGlossForLabel(String s)

getGlossForLabel1

protected String getGlossForLabel1(String s)
Fetches all the glosses from WordNet for the given term and concatenates them (without quotations).


splitStringForWordNet

protected String splitStringForWordNet(String s)
Retains only strings made of lowercase/uppercase characters. Suppresses numbers. Splits strings where they contain a LowercaseUppercase boundary or any of "/" ":" "_" "\" "+" "." "*" "&", but not "-" or "@", which are taken into account by Lucene.


wuPalmerSimilarity

public double wuPalmerSimilarity(String s1,
                                 String s2)
                          throws OntoSimException
Compute the Wu-Palmer similarity defined by score = 2*depth(lcs(s1,s2)) / (depth(s1) + depth(s2))

Parameters:
s1 -
s2 -
Returns:
the Wu-Palmer similarity. The algorithm returns the best Wu-Palmer similarity among the pairs of synsets corresponding to s1 and s2. Assumption: JE**1: root is when no hypernym exists... Sketch: 1) full depth-first search from s1, recording the shortest path distance from s1 and the depth; 2) depth-first search from s2 until the lcs is reached, recording the best Wu-Palmer. NOTE: The first phase (on s1) is a preprocessing step. In the case when the user wants to compute a whole Wu-Palmer matrix, this step is performed |s2| times: it may be worth caching this step.
Throws:
OntoSimException

computeSimilarity

public double computeSimilarity(String s1,
                                String s2)
This is an elaborate similarity based on WordNet. It is assumed to assess the similarity based on a decomposition and parsing of the strings.


compareComponentNames

public double compareComponentNames(String s1,
                                    String s2)

computeTokenSimilarity

public double computeTokenSimilarity(net.didion.jwnl.data.IndexWord index1,
                                     net.didion.jwnl.data.IndexWord index2)

findMatchForAdj

public double findMatchForAdj(net.didion.jwnl.data.IndexWord index1,
                              net.didion.jwnl.data.IndexWord index2)

lookUpWord

public void lookUpWord(String word,
                       Hashtable<String,net.didion.jwnl.data.IndexWord> nouns,
                       Hashtable<String,net.didion.jwnl.data.IndexWord> adjectives,
                       Hashtable<String,net.didion.jwnl.data.IndexWord> verbs)
Parameters:
word -

display

public void display(net.didion.jwnl.data.Synset syn)

getCommonConcepts

public int getCommonConcepts(net.didion.jwnl.data.list.PointerTargetNodeList list1,
                             net.didion.jwnl.data.list.PointerTargetNodeList list2)

bestMatch

private double bestMatch(double[][] matrix)

getNumberOfOccurences

public int getNumberOfOccurences(String token,
                                 int n)
Parameters:
token - A token.
n - The number of the ontology (typically 1 or 2).
Returns:
the number of occurrences of the token in the hashtables nouns, adjectives and verbs.

getNumberOfOccurences

public int getNumberOfOccurences(String token,
                                 Hashtable nouns,
                                 Hashtable adj,
                                 Hashtable verbs)

displayMatrix

public void displayMatrix(double[][] matrix)

fillWithOnes

public void fillWithOnes(double[][] matrix)

getAdjectivesResults

public double[][] getAdjectivesResults()

getNounsResults

public double[][] getNounsResults()

getVerbsResults

public double[][] getVerbsResults()

OntoSim 2.1

(C) INRIA & friends, 2008-2009