Index: contrib/analyzers/src/test/org/apache/lucene/analysis/fr/TestElision.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/fr/TestElision.java (revision 0)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/fr/TestElision.java (revision 0)
@@ -0,0 +1,55 @@
+package org.apache.lucene.analysis.fr;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+/**
+ * Checks that {@link ElisionFilter} strips a configured article and its
+ * apostrophe from the start of a token and leaves other tokens untouched.
+ *
+ * @author Mathieu Lecarme
+ */
+public class TestElision extends TestCase {
+
+  /**
+   * Only "l" and "M" are registered as articles, so "l'embrouille" and
+   * "M'enfin" lose their prefix while "O'brian" must come through unchanged.
+   */
+  public void testElision() throws IOException {
+    String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
+    Tokenizer tokenizer = new StandardTokenizer(new StringReader(test));
+    Set articles = new HashSet();
+    articles.add("l");
+    articles.add("M");
+    TokenFilter filter = new ElisionFilter(tokenizer, articles);
+    List tas = filtre(filter);
+    assertEquals("embrouille", tas.get(4));
+    assertEquals("O'brian", tas.get(6));
+    assertEquals("enfin", tas.get(7));
+  }
+
+  /**
+   * Drains the filter and returns the term text of every token, in order.
+   * An IOException is propagated so that a broken stream fails the test
+   * instead of yielding a truncated list.
+   */
+  private List filtre(TokenFilter filter) throws IOException {
+    List tas = new ArrayList();
+    for (Token token = filter.next(); token != null; token = filter.next()) {
+      tas.add(token.termText());
+    }
+    return tas;
+  }
+
+}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (revision 0)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (revision 0)
@@ -0,0 +1,120 @@
+package org.apache.lucene.analysis.fr;
+
+import java.io.IOException;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Arrays;
+import java.util.Iterator;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenFilter;
+
+/**
+ * Removes elisions from a token stream.
+ * For example, "l'avion" (the plane) will be tokenized as "avion" (plane).
+ * <p>
+ * Be careful: StandardTokenizer sees the typographic apostrophe (U+2019) as
+ * a space and cuts on it.
+ *
+ * @author Mathieu Lecarme
+ * @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision (Wikipedia, French)</a>
+ */
+public class ElisionFilter extends TokenFilter {
+
+  /**
+   * Characters accepted as the elision mark: the ASCII apostrophe and the
+   * typographic one. The latter is written as an escape so that the compiled
+   * constant does not depend on the source encoding given to the compiler.
+   */
+  private static final String APOSTROPHES = "'\u2019";
+
+  /** Lower-cased articles that are stripped when followed by an apostrophe. */
+  private Set articles;
+
+  /**
+   * Replaces the set of articles to strip. Each entry is stored lower-cased,
+   * and tokens are lower-cased before the lookup, so matching ignores case.
+   *
+   * @param articles the articles, as Strings, without the apostrophe
+   */
+  public void setArticles(Set articles) {
+    this.articles = lowerCaseCopy(articles.iterator());
+  }
+
+  /**
+   * Constructs an elision filter with the default articles:
+   * l, m, t, qu, n, s and j.
+   */
+  public ElisionFilter(TokenStream input) {
+    super(input);
+    this.articles = lowerCaseCopy(
+        Arrays.asList(new String[] {"l", "m", "t", "qu", "n", "s", "j"}).iterator());
+  }
+
+  /**
+   * Constructs an elision filter with a Set of articles.
+   */
+  public ElisionFilter(TokenStream input, Set articles) {
+    super(input);
+    this.articles = lowerCaseCopy(articles.iterator());
+  }
+
+  /**
+   * Constructs an elision filter with an array of articles.
+   */
+  public ElisionFilter(TokenStream input, String[] articles) {
+    super(input);
+    this.articles = lowerCaseCopy(Arrays.asList(articles).iterator());
+  }
+
+  /**
+   * Returns the next input Token with a leading article and its apostrophe
+   * removed from the term text. The split is made at the first apostrophe;
+   * the part before it is dropped only if it is one of the configured
+   * articles. Offsets and type are copied from the input token, so they still
+   * cover the removed article. If nothing follows the apostrophe the term
+   * text of the returned token is empty.
+   *
+   * @return the next token, or null when the input is exhausted
+   */
+  public Token next() throws IOException {
+    Token t = input.next();
+    if (t == null) {
+      return null;
+    }
+    String text = t.termText();
+    int cut = firstApostrophe(text);
+    if (cut == -1 || !articles.contains(text.substring(0, cut).toLowerCase())) {
+      // Nothing to strip: pass the token through untouched.
+      return t;
+    }
+    Token stripped =
+        new Token(text.substring(cut + 1), t.startOffset(), t.endOffset(), t.type());
+    // A new Token does not inherit the position increment of the one it
+    // replaces; copy it so gaps left by upstream filters are kept.
+    stripped.setPositionIncrement(t.getPositionIncrement());
+    return stripped;
+  }
+
+  /** Returns the index of the first apostrophe in text, or -1 if none. */
+  private static int firstApostrophe(String text) {
+    int first = -1;
+    for (int i = 0; i < APOSTROPHES.length(); i++) {
+      int poz = text.indexOf(APOSTROPHES.charAt(i));
+      if (poz != -1 && (first == -1 || poz < first)) {
+        first = poz;
+      }
+    }
+    return first;
+  }
+
+  /** Collects the Strings supplied by the iterator into a new Set, lower-cased. */
+  private static Set lowerCaseCopy(Iterator iter) {
+    Set lowered = new HashSet();
+    while (iter.hasNext()) {
+      lowered.add(((String) iter.next()).toLowerCase());
+    }
+    return lowered;
+  }
+
+}