Index: oak-lucene-mt/pom.xml =================================================================== --- oak-lucene-mt/pom.xml (nonexistent) +++ oak-lucene-mt/pom.xml (working copy) @@ -0,0 +1,121 @@ + + + + + + 4.0.0 + + + org.apache.jackrabbit + oak-parent + 1.8-SNAPSHOT + ../oak-parent/pom.xml + + + oak-lucene-mt + Oak Lucene Machine Translation + bundle + Machine Translation extension for Oak Lucene + + + + + org.apache.felix + maven-bundle-plugin + + + + + + + + + + + + + + + + org.apache.rat + apache-rat-plugin + + + + + + + + org.osgi + org.osgi.core + provided + + + org.osgi + org.osgi.compendium + provided + + + org.osgi + org.osgi.annotation + provided + + + org.apache.felix + org.apache.felix.scr.annotations + provided + + + + org.apache.jackrabbit + oak-lucene + ${project.version} + + + org.apache.joshua + joshua-incubating + 6.1 + + + + + org.slf4j + slf4j-api + + + + + com.google.code.findbugs + jsr305 + + + + + junit + junit + test + + + org.mockito + mockito-core + 1.10.19 + test + + + + Property changes on: oak-lucene-mt/pom.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProvider.java =================================================================== --- oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProvider.java (nonexistent) +++ oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProvider.java (working copy) @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.jackrabbit.oak.plugins.index.lucene.spi.mt; + +import javax.annotation.Nonnull; +import java.io.IOException; +import java.io.StringReader; +import java.util.List; +import java.util.Set; + +import org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames; +import org.apache.jackrabbit.oak.plugins.index.lucene.spi.FulltextQueryTermsProvider; +import org.apache.jackrabbit.oak.spi.state.NodeState; +import org.apache.joshua.decoder.Decoder; +import org.apache.joshua.decoder.StructuredTranslation; +import org.apache.joshua.decoder.Translation; +import org.apache.joshua.decoder.segment_file.Sentence; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * {@link FulltextQueryTermsProvider} that performs machine translation on full text returning a query containing + * translated tokens. 
+ */ +public class MTFulltextQueryTermsProvider implements FulltextQueryTermsProvider { + + private final Logger log = LoggerFactory.getLogger(getClass()); + + private final Decoder decoder; + private final Set nodeTypes; + private final float minScore; + + public MTFulltextQueryTermsProvider(Decoder decoder, Set nodeTypes, float minScore) { + this.decoder = decoder; + this.nodeTypes = nodeTypes; + this.minScore = minScore; + } + + @Override + public Query getQueryTerm(String text, Analyzer analyzer, NodeState indexDefinition) { + BooleanQuery query = new BooleanQuery(); + Sentence sentence = new Sentence(text, 0, decoder.getJoshuaConfiguration()); + Translation translation = decoder.decode(sentence); + log.debug("{} decoded into {}", text, translation); + // try phrase translation first + List structuredTranslations = translation.getStructuredTranslations(); + if (!structuredTranslations.isEmpty()) { + addTranslations(query, structuredTranslations); + } else { + // if phrase cannot be translated, perform token by token translation + try { + TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text)); + tokenStream.addAttribute(CharTermAttribute.class); + tokenStream.reset(); + while (tokenStream.incrementToken()) { + CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class); + Translation translatedToken = decoder.decode(new Sentence(attribute.toString(), 0, + decoder.getJoshuaConfiguration())); + addTranslations(query, translatedToken.getStructuredTranslations()); + } + tokenStream.end(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + } + return query.clauses().size() > 0 ? 
query : null; + } + + private void addTranslations(BooleanQuery query, List structuredTranslations) { + for (StructuredTranslation st : structuredTranslations) { + String translationString = st.getTranslationString(); + if (st.getTranslationScore() > minScore) { + query.add(new BooleanClause(new TermQuery(new Term(FieldNames.FULLTEXT, translationString)), + BooleanClause.Occur.SHOULD)); + log.debug("added query for translated phrase {}", translationString); + List translationTokens = st.getTranslationTokens(); + int i = 0; + // if output is a phrase, look for tokens having a word alignment to the original sentence terms + for (List wa : st.getTranslationWordAlignments()) { + if (!wa.isEmpty()) { + String translatedTerm = translationTokens.get(i); + log.debug("added query for translated token {}", translatedTerm); + query.add(new BooleanClause(new TermQuery(new Term(FieldNames.FULLTEXT, translatedTerm)), + BooleanClause.Occur.SHOULD)); + } + i++; + } + } + } + } + + public void clearResources() { + decoder.cleanUp(); + } + + @Nonnull + @Override + public Set getSupportedTypes() { + return nodeTypes; + } +} Property changes on: oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProvider.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderFactory.java =================================================================== --- oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderFactory.java (nonexistent) +++ oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderFactory.java (working copy) @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.jackrabbit.oak.plugins.index.lucene.spi.mt; + +import javax.annotation.Nonnull; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.felix.scr.annotations.Activate; +import org.apache.felix.scr.annotations.Component; +import org.apache.felix.scr.annotations.ConfigurationPolicy; +import org.apache.felix.scr.annotations.Deactivate; +import org.apache.felix.scr.annotations.Property; +import org.apache.felix.scr.annotations.Service; +import org.apache.jackrabbit.oak.commons.PropertiesUtil; +import org.apache.jackrabbit.oak.plugins.index.lucene.spi.FulltextQueryTermsProvider; +import org.apache.jackrabbit.oak.spi.state.NodeState; +import org.apache.joshua.decoder.Decoder; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.search.Query; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Factory for {@link MTFulltextQueryTermsProvider} + */ +@Component( + name = "org.apache.jackrabbit.oak.plugins.index.lucene.mt.MTFulltextQueryTermsProviderFactory", + label = "Apache Jackrabbit Oak Machine Translation Fulltext Query Terms Provider", + configurationFactory = true, + metatype = true, + policy = ConfigurationPolicy.REQUIRE +) 
+@Service(FulltextQueryTermsProvider.class) +public class MTFulltextQueryTermsProviderFactory implements FulltextQueryTermsProvider { + + private static final float DEFAULT_MIN_SCORE = 0.5f; + + private final Logger log = LoggerFactory.getLogger(getClass()); + + @Property(label = "Joshua Config Path", description = "The absolute filesystem path to Apache Joshua configuration file") + private static final String CONFIG_PATH = "path.to.config"; + + @Property(label = "Node types", description = "List of node types for which expanding the query via MT", cardinality = 10) + private static final String NODE_TYPES = "node.types"; + + @Property(label = "Minimum score", description = "Minimum allowed score for a translated phrase/term to be used for expansion", + floatValue = DEFAULT_MIN_SCORE) + private static final String MIN_SCORE = "min.score"; + + private MTFulltextQueryTermsProvider queryTermsProvider; + + @Activate + public void activate(Map config) throws Exception { + String pathToJoshuaConfig = PropertiesUtil.toString(config.get(CONFIG_PATH), "."); + String[] nts = PropertiesUtil.toStringArray(config.get(NODE_TYPES), new String[]{"Oak:unstructured"}); + float minScore = (float) PropertiesUtil.toDouble(config.get(MIN_SCORE), DEFAULT_MIN_SCORE); + log.info("activating MT FulltextQueryTermProvider from Joshua config at {} on {} nodetypes", pathToJoshuaConfig, nts); + try { + log.debug("parsing joshua config file"); + Decoder decoder = Decoder.createDecoder(pathToJoshuaConfig); + decoder.getJoshuaConfiguration().use_structured_output = true; + decoder.getJoshuaConfiguration().sanityCheck(); + log.debug("decoder initialized"); + Set nodeTypes = new HashSet<>(); + nodeTypes.addAll(Arrays.asList(nts)); + queryTermsProvider = new MTFulltextQueryTermsProvider(decoder, nodeTypes, minScore); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Deactivate + public void deactivate() throws Exception { + log.info("clearing resources"); + 
queryTermsProvider.clearResources(); + } + + @Override + public Query getQueryTerm(String text, Analyzer analyzer, NodeState indexDefinition) { + return queryTermsProvider.getQueryTerm(text, analyzer, indexDefinition); + } + + @Nonnull + @Override + public Set getSupportedTypes() { + return queryTermsProvider.getSupportedTypes(); + } +} Property changes on: oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: oak-lucene-mt/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderTest.java =================================================================== --- oak-lucene-mt/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderTest.java (nonexistent) +++ oak-lucene-mt/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderTest.java (working copy) @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
package org.apache.jackrabbit.oak.plugins.index.lucene.spi.mt;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.joshua.decoder.Decoder;
import org.apache.joshua.decoder.JoshuaConfiguration;
import org.apache.joshua.decoder.StructuredTranslation;
import org.apache.joshua.decoder.Translation;
import org.apache.joshua.decoder.segment_file.Sentence;
import org.apache.lucene.analysis.Analyzer;
import org.junit.Test;

import static org.junit.Assert.assertNotNull;
import static org.mockito.Matchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

/**
 * Tests for {@link MTFulltextQueryTermsProvider}
 */
public class MTFulltextQueryTermsProviderTest {

    /**
     * A phrase for which the decoder returns a structured translation must produce a non-null query.
     */
    @Test
    public void testGetQueryTermWithPhraseTranslation() throws Exception {
        // decoder that translates any sentence into the single phrase "fou bur"
        StructuredTranslation structuredTranslation = mock(StructuredTranslation.class);
        when(structuredTranslation.getTranslationString()).thenReturn("fou bur");
        List<StructuredTranslation> translations = new LinkedList<>();
        translations.add(structuredTranslation);

        Translation translation = mock(Translation.class);
        when(translation.getStructuredTranslations()).thenReturn(translations);

        Decoder decoder = mock(Decoder.class);
        when(decoder.decode(any(Sentence.class))).thenReturn(translation);
        JoshuaConfiguration configuration = mock(JoshuaConfiguration.class);
        when(decoder.getJoshuaConfiguration()).thenReturn(configuration);

        // the translation score is not stubbed, so a negative minimum score lets the translation through
        Set<String> nodeTypes = new HashSet<>();
        MTFulltextQueryTermsProvider mtFulltextQueryTermsProvider =
                new MTFulltextQueryTermsProvider(decoder, nodeTypes, -1);

        Analyzer analyzer = mock(Analyzer.class);
        NodeState indexDefinition = mock(NodeState.class);
        assertNotNull(mtFulltextQueryTermsProvider.getQueryTerm("foo bar", analyzer, indexDefinition));
    }
}
oak-lucene-mt/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderTest.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property