Index: contrib/elision/src/test/org/apache/lucene/analysis/TestElision.java
===================================================================
--- contrib/elision/src/test/org/apache/lucene/analysis/TestElision.java (revision 0)
+++ contrib/elision/src/test/org/apache/lucene/analysis/TestElision.java (revision 0)
@@ -0,0 +1,63 @@
+/**
+ *
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+/**
+ * Unit test for {@link ElisionFilter}.
+ *
+ * @author Mathieu Lecarme
+ */
+public class TestElision extends TestCase {
+
+  /**
+   * Checks that a prefix listed in the stop set ("l'") is stripped from its
+   * token, while a prefix that is not listed ("O'") is left in place.
+   *
+   * @throws IOException if the token stream cannot be read
+   */
+  public void testElision() throws IOException {
+    String test = "Plop, juste pour voir l'embrouille avec O'brian";
+    Tokenizer tokenizer = new StandardTokenizer(new StringReader(test));
+    Set stops = new HashSet();
+    stops.add("l");
+    TokenFilter filter = new ElisionFilter(tokenizer, stops);
+    List tas = filtre(filter);
+    assertEquals("embrouille", tas.get(4));
+    assertEquals("O'brian", tas.get(6));
+  }
+
+  /**
+   * Drains the filter and returns the term text of every token, in order.
+   * I/O failures propagate to the caller so that the test fails on the real
+   * cause instead of on a truncated list.
+   *
+   * @param filter the token stream to consume
+   * @return a List of String term texts
+   * @throws IOException if the token stream cannot be read
+   */
+  private List filtre(TokenFilter filter) throws IOException {
+    List tas = new ArrayList();
+    Token token = filter.next();
+    while (token != null) {
+      tas.add(token.termText());
+      token = filter.next();
+    }
+    return tas;
+  }
+
+}
Index: contrib/elision/src/java/org/apache/lucene/analysis/ElisionFilter.java
===================================================================
--- contrib/elision/src/java/org/apache/lucene/analysis/ElisionFilter.java (revision 0)
+++ contrib/elision/src/java/org/apache/lucene/analysis/ElisionFilter.java (revision 0)
@@ -0,0 +1,78 @@
+/**
+ *
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Arrays;
+
+/**
+ * Removes elisions from a token stream.
+ * For example, "l'avion" (the plane) will be tokenized as "avion" (plane).
+ *
+ * @author Mathieu Lecarme
+ * @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision (Wikipedia, in French)</a>
+ */
+public class ElisionFilter extends TokenFilter {
+  /** Prefixes removed, together with the apostrophe, from the start of a token. */
+  private Set stopwords = null;
+
+  /**
+   * Constructs an elision filter with the standard stop words
+   * (l, m, t, qu, n, s, j).
+   *
+   * @param input the token stream to filter
+   */
+  public ElisionFilter(TokenStream input) {
+    super(input);
+    this.stopwords = new HashSet(Arrays.asList(new String[] {"l","m","t","qu","n","s","j"}));
+  }
+
+  /**
+   * Constructs an elision filter with a Set of stop words.
+   *
+   * @param input the token stream to filter
+   * @param stopwords lower-case prefixes to strip; the set is used as is, not copied
+   */
+  public ElisionFilter(TokenStream input, Set stopwords) {
+    super(input);
+    this.stopwords = stopwords;
+  }
+
+  /**
+   * Constructs an elision filter with an array of stop words.
+   *
+   * @param input the token stream to filter
+   * @param stopwords lower-case prefixes to strip
+   */
+  public ElisionFilter(TokenStream input, String[] stopwords) {
+    super(input);
+    this.stopwords = new HashSet(Arrays.asList(stopwords));
+  }
+
+  /**
+   * Returns the next input Token, with a leading elided stop word and its
+   * apostrophe removed from the term text. Offsets and type are kept from
+   * the input token.
+   *
+   * @return the next token, or null at the end of the stream
+   * @throws IOException if the underlying stream fails
+   */
+  public Token next() throws IOException {
+    Token t = input.next();
+    if (t == null)
+      return null;
+    String text = t.termText();
+    int poz = text.indexOf('\'');
+    // The prefix is lower-cased before the lookup, so "L'avion" matches "l".
+    if (poz == -1 || !stopwords.contains(text.substring(0, poz).toLowerCase()))
+      return t;
+    Token stripped = new Token(text.substring(poz + 1), t.startOffset(), t.endOffset(), t.type());
+    // Carry the position increment over; a newly built Token starts with the default one.
+    stripped.setPositionIncrement(t.getPositionIncrement());
+    return stripped;
+  }
+
+}
Index: contrib/elision/build.xml
===================================================================
--- contrib/elision/build.xml (revision 0)
+++ contrib/elision/build.xml (revision 0)
@@ -0,0 +1,10 @@
+
+
+
+
+
+ Elision
+
+
+
+

Property changes on: contrib/elision/build.xml
___________________________________________________________________
Name: svn:executable
   + *