Index: java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java,v retrieving revision 1.2 diff -u -r1.2 FrenchAnalyzer.java --- java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java 22 Jan 2004 20:49:16 -0000 1.2 +++ java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java 23 Jan 2004 10:14:21 -0000 @@ -1,186 +1,184 @@ -package org.apache.lucene.analysis.fr; - -/* ==================================================================== - * The Apache Software License, Version 1.1 - * - * Copyright (c) 2004 The Apache Software Foundation. All rights - * reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. The end-user documentation included with the redistribution, - * if any, must include the following acknowledgment: - * "This product includes software developed by the - * Apache Software Foundation (http://www.apache.org/)." - * Alternately, this acknowledgment may appear in the software itself, - * if and wherever such third-party acknowledgments normally appear. - * - * 4. The names "Apache" and "Apache Software Foundation" and - * "Apache Lucene" must not be used to endorse or promote products - * derived from this software without prior written permission. For - * written permission, please contact apache@apache.org. - * - * 5. 
Products derived from this software may not be called "Apache", - * "Apache Lucene", nor may "Apache" appear in their name, without - * prior written permission of the Apache Software Foundation. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * ==================================================================== - * - * This software consists of voluntary contributions made by many - * individuals on behalf of the Apache Software Foundation. For more - * information on the Apache Software Foundation, please see - * . - */ - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.LowerCaseFilter; -import org.apache.lucene.analysis.StopFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardFilter; -import org.apache.lucene.analysis.standard.StandardTokenizer; -import java.io.File; -import java.io.Reader; -import java.util.Hashtable; -import org.apache.lucene.analysis.de.WordlistLoader; - -/** - * Analyzer for french language. Supports an external list of stopwords (words that - * will not be indexed at all) and an external list of exclusions (word that will - * not be stemmed, but indexed). 
- * A default set of stopwords is used unless an other list is specified, the - * exclusionlist is empty by default. - * - * @author Patrick Talbot (based on Gerhard Schwarz work for German) - */ -public final class FrenchAnalyzer extends Analyzer { - - /** - * Extended list of typical french stopwords. - */ - private String[] FRENCH_STOP_WORDS = { - "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi", - "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir", - "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain", - "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci", - "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout", - "dedans", "dehors", "delà", "depuis", "derrière", "des", "désormais", "desquelles", - "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse", - "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "dès", "elle", "elles", - "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepté", "hormis", - "hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle", - "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "là", - "ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi", - "moins", "mon", "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre", - "nous", "néanmoins", "nôtre", "nôtres", "on", "ont", "ou", "outre", "où", "par", "parmi", - "partant", "pas", "passé", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi", - "proche", "près", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels", - "qui", "quoi", "quoique", "revoici", "revoilà", "s", "sa", "sans", "sauf", "se", "selon", - "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit", - "son", "sont", "sous", "suivant", "sur", "ta", 
"te", "tes", "tien", "tienne", "tiennes", - "tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers", - "voici", "voilà", "vos", "votre", "vous", "vu", "vôtre", "vôtres", "y", "à", "ça", "ès", - "été", "être", "ô" - }; - - /** - * Contains the stopwords used with the StopFilter. - */ - private Hashtable stoptable = new Hashtable(); - /** - * Contains words that should be indexed but not stemmed. - */ - private Hashtable excltable = new Hashtable(); - - /** - * Builds an analyzer. - */ - public FrenchAnalyzer() { - stoptable = StopFilter.makeStopTable(FRENCH_STOP_WORDS); - } - - /** - * Builds an analyzer with the given stop words. - */ - public FrenchAnalyzer(String[] stopwords) { - stoptable = StopFilter.makeStopTable(stopwords); - } - - /** - * Builds an analyzer with the given stop words. - */ - public FrenchAnalyzer(Hashtable stopwords) { - stoptable = stopwords; - } - - /** - * Builds an analyzer with the given stop words. - */ - public FrenchAnalyzer(File stopwords) { - stoptable = WordlistLoader.getWordtable(stopwords); - } - - /** - * Builds an exclusionlist from an array of Strings. - */ - public void setStemExclusionTable(String[] exclusionlist) { - excltable = StopFilter.makeStopTable(exclusionlist); - } - - /** - * Builds an exclusionlist from a Hashtable. - */ - public void setStemExclusionTable(Hashtable exclusionlist) { - excltable = exclusionlist; - } - - /** - * Builds an exclusionlist from the words contained in the given file. - */ - public void setStemExclusionTable(File exclusionlist) { - excltable = WordlistLoader.getWordtable(exclusionlist); - } - - /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. 
- * - * @return A TokenStream build from a StandardTokenizer filtered with - * StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter - */ - public final TokenStream tokenStream(String fieldName, Reader reader) { - - if (fieldName == null) - throw new IllegalArgumentException - ("fieldName must not be null"); - if (reader == null) throw new IllegalArgumentException("reader must not be null"); - - TokenStream result = new StandardTokenizer(reader); - result = new StandardFilter(result); - result = new StopFilter(result, stoptable); - result = new FrenchStemFilter(result, excltable); - // Convert to lowercase after stemming! - result = new LowerCaseFilter(result); - return result; - } -} +package org.apache.lucene.analysis.fr; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2004 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. 
The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . 
+ */ + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import java.io.File; +import java.io.Reader; +import java.util.Hashtable; +import org.apache.lucene.analysis.de.WordlistLoader; + +/** + * Analyzer for the French language. Supports an external list of stopwords (words that + * will not be indexed at all) and an external list of exclusions (words that will + * not be stemmed, but indexed). + * A default set of stopwords is used unless another list is specified; the + * exclusion list is empty by default. + * + * @author Patrick Talbot (based on Gerhard Schwarz's work for German) + * @version $Id: FrenchAnalyzer.java,v 1.1 2004/01/20 10:07:01 ehatcher Exp $ + */ +public final class FrenchAnalyzer extends Analyzer { + + /** + * Extended list of typical French stopwords. 
+ */ + private String[] FRENCH_STOP_WORDS = { + "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi", + "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir", + "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain", + "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci", + "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout", + "dedans", "dehors", "delà", "depuis", "derrière", "des", "désormais", "desquelles", + "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse", + "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "dès", "elle", "elles", + "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepté", "hormis", + "hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle", + "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "là", + "ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi", + "moins", "mon", "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre", + "nous", "néanmoins", "nôtre", "nôtres", "on", "ont", "ou", "outre", "où", "par", "parmi", + "partant", "pas", "passé", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi", + "proche", "près", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels", + "qui", "quoi", "quoique", "revoici", "revoilà", "s", "sa", "sans", "sauf", "se", "selon", + "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit", + "son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes", + "tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers", + "voici", "voilà", "vos", "votre", "vous", "vu", "vôtre", "vôtres", "y", "à", "ça", "ès", + "été", "être", "ô" + }; + + /** + * Contains the stopwords used with the 
StopFilter. + */ + private Hashtable stoptable = new Hashtable(); + /** + * Contains words that should be indexed but not stemmed. + */ + private Hashtable excltable = new Hashtable(); + + /** + * Builds an analyzer. + */ + public FrenchAnalyzer() { + stoptable = StopFilter.makeStopTable( FRENCH_STOP_WORDS ); + } + + /** + * Builds an analyzer with the given stop words. + */ + public FrenchAnalyzer( String[] stopwords ) { + stoptable = StopFilter.makeStopTable( stopwords ); + } + + /** + * Builds an analyzer with the given stop words. + */ + public FrenchAnalyzer( Hashtable stopwords ) { + stoptable = stopwords; + } + + /** + * Builds an analyzer with the given stop words. + */ + public FrenchAnalyzer( File stopwords ) { + stoptable = WordlistLoader.getWordtable( stopwords ); + } + + /** + * Builds an exclusionlist from an array of Strings. + */ + public void setStemExclusionTable( String[] exclusionlist ) { + excltable = StopFilter.makeStopTable( exclusionlist ); + } + /** + * Builds an exclusionlist from a Hashtable. + */ + public void setStemExclusionTable( Hashtable exclusionlist ) { + excltable = exclusionlist; + } + /** + * Builds an exclusionlist from the words contained in the given file. + */ + public void setStemExclusionTable( File exclusionlist ) { + excltable = WordlistLoader.getWordtable( exclusionlist ); + } + + /** + * Creates a TokenStream which tokenizes all the text in the provided Reader. 
+ * + * @return A TokenStream built from a StandardTokenizer filtered with + * StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter + */ + public final TokenStream tokenStream( String fieldName, Reader reader ) { + + if (fieldName==null) throw new IllegalArgumentException("fieldName must not be null"); + if (reader==null) throw new IllegalArgumentException("reader must not be null"); + + TokenStream result = new StandardTokenizer( reader ); + result = new StandardFilter( result ); + result = new StopFilter( result, stoptable ); + result = new FrenchStemFilter( result, excltable ); + // Convert to lowercase after stemming! + result = new LowerCaseFilter( result ); + return result; + } +} + Index: test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene-sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java,v retrieving revision 1.1 diff -u -r1.1 TestFrenchAnalyzer.java --- test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java 22 Jan 2004 20:49:16 -0000 1.1 +++ test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java 23 Jan 2004 10:14:22 -0000 @@ -54,6 +54,7 @@ * . */ +import java.io.Reader; import java.io.StringReader; import junit.framework.TestCase; @@ -65,126 +66,127 @@ /** * Test case for FrenchAnalyzer. 
* - * @author Jean-Fran̤ois Halleux + * @author Jean-François Halleux + * @version $version$ */ -public class TestFrenchAnalyzer extends TestCase { - - public void assertAnalyzesTo(Analyzer a, String input, String[] output) - throws Exception { - TokenStream ts = a.tokenStream("dummy", new StringReader - (input)); +public class TestFrenchAnalyzer extends TestCase { - for (int i = 0; i < output.length; i++) { - Token t = ts.next(); - assertNotNull(t); - assertEquals(t.termText(), output[i]); - } - assertNull(ts.next()); - ts.close(); - } - - public void testAnalyzer() throws Exception { - FrenchAnalyzer fa = new FrenchAnalyzer(); - - // test null reader - boolean iaeFlag = false; - try { - TokenStream ts = fa.tokenStream("dummy", null); - } catch (IllegalArgumentException iae) { - iaeFlag = true; - } - assertEquals(iaeFlag, true); - - // test null fieldname - iaeFlag = true; - try { - TokenStream ts = fa.tokenStream(null, new StringReader - ("dummy")); - } catch (IllegalArgumentException iae) { - iaeFlag = true; - } - assertEquals(iaeFlag, true); - - assertAnalyzesTo(fa, "", new String[]{ - }); - - assertAnalyzesTo( - fa, - "chien chat cheval", - new String[]{"chien", "chat", "cheval"}); - - assertAnalyzesTo( - fa, - "chien CHAT CHEVAL", - new String[]{"chien", "chat", "cheval"}); - - assertAnalyzesTo( - fa, - " chien ,? + = - CHAT /: > CHEVAL", - new String[]{"chien", "chat", "cheval"}); - - assertAnalyzesTo(fa, "chien++", new String[]{"chien"}); - - assertAnalyzesTo( - fa, - "mot \"entreguillemet\"", - new String[]{"mot", "entreguillemet"}); - - // let's do some french specific tests now - - // 1. couldn't resist - // I would expect this to stay one term as in French the minus sign - // is often used for composing words - assertAnalyzesTo( - fa, - "Jean-Fran̤ois", - new String[]{"jean", "fran̤ois"}); - - // 2. 
stopwords - assertAnalyzesTo( - fa, - "le la chien les aux chat du des ÌÊ cheval", - new String[]{"chien", "chat", "cheval"}); - - // some nouns and adjectives - assertAnalyzesTo( - fa, - "lances chismes habitable chiste Ì©lÌ©ments captifs", - new String[]{ - "lanc", - "chism", - "habit", - "chist", - "Ì©lÌ©ment", - "captif"}); - - // some verbs - assertAnalyzesTo( - fa, - "finissions souffrirent rugissante", - new String[]{"fin", "souffr", "rug"}); - - // some everything else - // aujourd'hui stays one term which is OK - assertAnalyzesTo( - fa, - "C3PO aujourd'hui oeuf ÌøÌ¢Ì¦ÌÈÌÊÌ? anticonstitutionnellement Java++", - new String[]{ - "c3po", - "aujourd'hui", - "oeuf", - "ÌøÌ¢Ì¦ÌÈÌÊÌ?", - "anticonstitutionnel", - "jav"}); - - // some more everything else - // here 1940-1945 stays as one term, 1940:1945 not ? - assertAnalyzesTo( - fa, - "33Bis 1940-1945 1940:1945 (---i+++)*", - new String[]{"33bis", "1940-1945", "1940", "1945", "i" }); + // Method copied from TestAnalyzers, maybe should be refactored + public void assertAnalyzesTo(Analyzer a, String input, String[] output) + throws Exception { + + TokenStream ts = a.tokenStream("dummy", new StringReader(input)); + + for (int i = 0; i < output.length; i++) { + Token t = ts.next(); + assertNotNull(t); + assertEquals(t.termText(), output[i]); + } + assertNull(ts.next()); + ts.close(); + } + + public void testAnalyzer() throws Exception { + FrenchAnalyzer fa = new FrenchAnalyzer(); + + // test null reader + boolean iaeFlag = false; + try { + TokenStream ts = fa.tokenStream("dummy", null); + } catch (IllegalArgumentException iae) { + iaeFlag = true; + } + assertEquals(iaeFlag, true); + + // test null fieldname + iaeFlag = false; + try { + TokenStream ts = fa.tokenStream(null, new StringReader("dummy")); + } catch (IllegalArgumentException iae) { + iaeFlag = true; + } + assertEquals(iaeFlag, true); + + assertAnalyzesTo(fa, "", new String[] { + }); + + assertAnalyzesTo( + fa, + "chien chat cheval", + new String[] { 
"chien", "chat", "cheval" }); + + assertAnalyzesTo( + fa, + "chien CHAT CHEVAL", + new String[] { "chien", "chat", "cheval" }); + + assertAnalyzesTo( + fa, + " chien ,? + = - CHAT /: > CHEVAL", + new String[] { "chien", "chat", "cheval" }); + + assertAnalyzesTo(fa, "chien++", new String[] { "chien" }); + + assertAnalyzesTo( + fa, + "mot \"entreguillemet\"", + new String[] { "mot", "entreguillemet" }); + + // let's do some french specific tests now + + /* 1. couldn't resist + I would expect this to stay one term as in French the minus + sign is often used for composing words */ + assertAnalyzesTo( + fa, + "Jean-François", + new String[] { "jean", "françois" }); + + // 2. stopwords + assertAnalyzesTo( + fa, + "le la chien les aux chat du des à cheval", + new String[] { "chien", "chat", "cheval" }); + + // some nouns and adjectives + assertAnalyzesTo( + fa, + "lances chismes habitable chiste éléments captifs", + new String[] { + "lanc", + "chism", + "habit", + "chist", + "élément", + "captif" }); + + // some verbs + assertAnalyzesTo( + fa, + "finissions souffrirent rugissante", + new String[] { "fin", "souffr", "rug" }); + + // some everything else + // aujourd'hui stays one term which is OK + assertAnalyzesTo( + fa, + "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ", + new String[] { + "c3po", + "aujourd'hui", + "oeuf", + "ïâöûàä", + "anticonstitutionnel", + "jav" }); + + // some more everything else + // here 1940-1945 stays as one term, 1940:1945 not ? + assertAnalyzesTo( + fa, + "33Bis 1940-1945 1940:1945 (---i+++)*", + new String[] { "33bis", "1940-1945", "1940", "1945", "i" }); - } + } - } +}