Index: oak-lucene-mt/pom.xml
===================================================================
--- oak-lucene-mt/pom.xml (nonexistent)
+++ oak-lucene-mt/pom.xml (working copy)
@@ -0,0 +1,121 @@
+
+
+
+
+
+ 4.0.0
+
+
+ org.apache.jackrabbit
+ oak-parent
+ 1.8-SNAPSHOT
+ ../oak-parent/pom.xml
+
+
+ oak-lucene-mt
+ Oak Lucene Machine Translation
+ bundle
+ Machine Translation extension for Oak Lucene
+
+
+
+
+ org.apache.felix
+ maven-bundle-plugin
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ org.apache.rat
+ apache-rat-plugin
+
+
+
+
+
+
+
+ org.osgi
+ org.osgi.core
+ provided
+
+
+ org.osgi
+ org.osgi.compendium
+ provided
+
+
+ org.osgi
+ org.osgi.annotation
+ provided
+
+
+ org.apache.felix
+ org.apache.felix.scr.annotations
+ provided
+
+
+
+ org.apache.jackrabbit
+ oak-lucene
+ ${project.version}
+
+
+ org.apache.joshua
+ joshua-incubating
+ 6.1
+
+
+
+
+ org.slf4j
+ slf4j-api
+
+
+
+
+ com.google.code.findbugs
+ jsr305
+
+
+
+
+ junit
+ junit
+ test
+
+
+ org.mockito
+ mockito-core
+ 1.10.19
+ test
+
+
+
+
Property changes on: oak-lucene-mt/pom.xml
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProvider.java
===================================================================
--- oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProvider.java (nonexistent)
+++ oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProvider.java (working copy)
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.spi.mt;
+
+import javax.annotation.Nonnull;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames;
+import org.apache.jackrabbit.oak.plugins.index.lucene.spi.FulltextQueryTermsProvider;
+import org.apache.jackrabbit.oak.spi.state.NodeState;
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.StructuredTranslation;
+import org.apache.joshua.decoder.Translation;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * {@link FulltextQueryTermsProvider} that performs machine translation on full text returning a query containing
+ * translated tokens.
+ * <p>
+ * The whole text is decoded first; when that yields no structured translation the text is analyzed and each
+ * token is decoded on its own. Translations scoring above the minimum score are added as {@code SHOULD}
+ * term queries on the {@link FieldNames#FULLTEXT} field.
+ */
+public class MTFulltextQueryTermsProvider implements FulltextQueryTermsProvider {
+
+    private final Logger log = LoggerFactory.getLogger(getClass());
+
+    private final Decoder decoder;
+    private final Set<String> nodeTypes;
+    private final float minScore;
+
+    /**
+     * @param decoder   the Joshua decoder used to translate phrases and single tokens
+     * @param nodeTypes the node types returned by {@link #getSupportedTypes()}
+     * @param minScore  a translation is used only when its score is strictly greater than this value
+     */
+    public MTFulltextQueryTermsProvider(Decoder decoder, Set<String> nodeTypes, float minScore) {
+        this.decoder = decoder;
+        this.nodeTypes = nodeTypes;
+        this.minScore = minScore;
+    }
+
+    /**
+     * Builds a query from the machine translation of the given text.
+     *
+     * @param text            the full text to translate
+     * @param analyzer        the analyzer used to tokenize {@code text} when it cannot be translated as a phrase
+     * @param indexDefinition the index definition; not read by this implementation
+     * @return a {@link BooleanQuery} holding the translated terms, or {@code null} when no clause was added
+     * @throws RuntimeException wrapping the {@link IOException} raised while tokenizing {@code text}
+     */
+    @Override
+    public Query getQueryTerm(String text, Analyzer analyzer, NodeState indexDefinition) {
+        BooleanQuery query = new BooleanQuery();
+        Sentence sentence = new Sentence(text, 0, decoder.getJoshuaConfiguration());
+        Translation translation = decoder.decode(sentence);
+        log.debug("{} decoded into {}", text, translation);
+        // try phrase translation first
+        List<StructuredTranslation> structuredTranslations = translation.getStructuredTranslations();
+        if (!structuredTranslations.isEmpty()) {
+            addTranslations(query, structuredTranslations);
+        } else {
+            // if phrase cannot be translated, perform token by token translation
+            addTokenTranslations(query, text, analyzer);
+        }
+        return query.clauses().isEmpty() ? null : query;
+    }
+
+    /**
+     * Tokenizes the text with the given analyzer and adds the translations of each single token to the query.
+     */
+    private void addTokenTranslations(BooleanQuery query, String text, Analyzer analyzer) {
+        // try-with-resources so the stream is closed even when decoding or tokenizing fails half way through
+        try (TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text))) {
+            CharTermAttribute attribute = tokenStream.addAttribute(CharTermAttribute.class);
+            tokenStream.reset();
+            while (tokenStream.incrementToken()) {
+                Translation translatedToken = decoder.decode(new Sentence(attribute.toString(), 0,
+                        decoder.getJoshuaConfiguration()));
+                addTranslations(query, translatedToken.getStructuredTranslations());
+            }
+            tokenStream.end();
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Adds a term query for every translation scoring above the minimum score, plus one term query for each
+     * translated token that has a word alignment.
+     */
+    private void addTranslations(BooleanQuery query, List<StructuredTranslation> structuredTranslations) {
+        for (StructuredTranslation st : structuredTranslations) {
+            String translationString = st.getTranslationString();
+            if (st.getTranslationScore() > minScore) {
+                query.add(new BooleanClause(new TermQuery(new Term(FieldNames.FULLTEXT, translationString)),
+                        BooleanClause.Occur.SHOULD));
+                log.debug("added query for translated phrase {}", translationString);
+                List<String> translationTokens = st.getTranslationTokens();
+                int i = 0;
+                // if output is a phrase, look for tokens having a word alignment to the original sentence terms
+                for (List<Integer> wa : st.getTranslationWordAlignments()) {
+                    if (i >= translationTokens.size()) {
+                        // more alignment entries than tokens: there is no token left to look up
+                        break;
+                    }
+                    if (!wa.isEmpty()) {
+                        String translatedTerm = translationTokens.get(i);
+                        log.debug("added query for translated token {}", translatedTerm);
+                        query.add(new BooleanClause(new TermQuery(new Term(FieldNames.FULLTEXT, translatedTerm)),
+                                BooleanClause.Occur.SHOULD));
+                    }
+                    i++;
+                }
+            }
+        }
+    }
+
+    /**
+     * Calls {@code cleanUp()} on the underlying decoder.
+     */
+    public void clearResources() {
+        decoder.cleanUp();
+    }
+
+    /**
+     * @return the node types this provider was created with
+     */
+    @Nonnull
+    @Override
+    public Set<String> getSupportedTypes() {
+        return nodeTypes;
+    }
+}
Property changes on: oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProvider.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderFactory.java
===================================================================
--- oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderFactory.java (nonexistent)
+++ oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderFactory.java (working copy)
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.spi.mt;
+
+import javax.annotation.Nonnull;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.jackrabbit.oak.commons.PropertiesUtil;
+import org.apache.jackrabbit.oak.plugins.index.lucene.spi.FulltextQueryTermsProvider;
+import org.apache.jackrabbit.oak.spi.state.NodeState;
+import org.apache.joshua.decoder.Decoder;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.Query;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Factory for {@link MTFulltextQueryTermsProvider}
+ * <p>
+ * Component that builds a Joshua {@link Decoder} from its configuration on activation and forwards every
+ * {@link FulltextQueryTermsProvider} call to the {@link MTFulltextQueryTermsProvider} created around it.
+ */
+@Component(
+        name = "org.apache.jackrabbit.oak.plugins.index.lucene.mt.MTFulltextQueryTermsProviderFactory",
+        label = "Apache Jackrabbit Oak Machine Translation Fulltext Query Terms Provider",
+        configurationFactory = true,
+        metatype = true,
+        policy = ConfigurationPolicy.REQUIRE
+)
+@Service(FulltextQueryTermsProvider.class)
+public class MTFulltextQueryTermsProviderFactory implements FulltextQueryTermsProvider {
+
+    private static final float DEFAULT_MIN_SCORE = 0.5f;
+
+    private final Logger log = LoggerFactory.getLogger(getClass());
+
+    @Property(label = "Joshua Config Path", description = "The absolute filesystem path to Apache Joshua configuration file")
+    private static final String CONFIG_PATH = "path.to.config";
+
+    @Property(label = "Node types", description = "List of node types for which expanding the query via MT", cardinality = 10)
+    private static final String NODE_TYPES = "node.types";
+
+    @Property(label = "Minimum score", description = "Minimum allowed score for a translated phrase/term to be used for expansion",
+            floatValue = DEFAULT_MIN_SCORE)
+    private static final String MIN_SCORE = "min.score";
+
+    // null until activate() has completed, and again once deactivate() has run
+    private MTFulltextQueryTermsProvider queryTermsProvider;
+
+    /**
+     * Creates the decoder from the configured Joshua config file and sets up the delegate provider.
+     *
+     * @param config the component configuration
+     * @throws RuntimeException wrapping any exception raised while creating or checking the decoder
+     */
+    @Activate
+    public void activate(Map<String, ?> config) throws Exception {
+        String pathToJoshuaConfig = PropertiesUtil.toString(config.get(CONFIG_PATH), ".");
+        // NOTE(review): this default looks like a misspelling of "oak:Unstructured" - confirm before relying on it
+        String[] nts = PropertiesUtil.toStringArray(config.get(NODE_TYPES), new String[]{"Oak:unstructured"});
+        float minScore = (float) PropertiesUtil.toDouble(config.get(MIN_SCORE), DEFAULT_MIN_SCORE);
+        log.info("activating MT FulltextQueryTermProvider from Joshua config at {} on {} nodetypes", pathToJoshuaConfig, nts);
+        try {
+            log.debug("parsing joshua config file");
+            Decoder decoder = Decoder.createDecoder(pathToJoshuaConfig);
+            decoder.getJoshuaConfiguration().use_structured_output = true;
+            decoder.getJoshuaConfiguration().sanityCheck();
+            log.debug("decoder initialized");
+            Set<String> nodeTypes = new HashSet<>(Arrays.asList(nts));
+            queryTermsProvider = new MTFulltextQueryTermsProvider(decoder, nodeTypes, minScore);
+        } catch (Exception e) {
+            throw new RuntimeException("could not initialize Joshua decoder from " + pathToJoshuaConfig, e);
+        }
+    }
+
+    /**
+     * Clears the delegate's resources and drops the delegate so it cannot be used after its decoder was cleaned up.
+     */
+    @Deactivate
+    public void deactivate() throws Exception {
+        log.info("clearing resources");
+        MTFulltextQueryTermsProvider provider = queryTermsProvider;
+        queryTermsProvider = null;
+        if (provider != null) {
+            provider.clearResources();
+        }
+    }
+
+    /**
+     * Forwards the call to the delegate provider.
+     *
+     * @return the delegate's query, or {@code null} when no delegate is set
+     */
+    @Override
+    public Query getQueryTerm(String text, Analyzer analyzer, NodeState indexDefinition) {
+        MTFulltextQueryTermsProvider provider = queryTermsProvider;
+        return provider != null ? provider.getQueryTerm(text, analyzer, indexDefinition) : null;
+    }
+
+    /**
+     * @return the delegate's supported node types, or an empty set when no delegate is set
+     */
+    @Nonnull
+    @Override
+    public Set<String> getSupportedTypes() {
+        MTFulltextQueryTermsProvider provider = queryTermsProvider;
+        return provider != null ? provider.getSupportedTypes() : Collections.<String>emptySet();
+    }
+}
Property changes on: oak-lucene-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderFactory.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: oak-lucene-mt/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderTest.java
===================================================================
--- oak-lucene-mt/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderTest.java (nonexistent)
+++ oak-lucene-mt/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderTest.java (working copy)
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.spi.mt;
+
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.jackrabbit.oak.spi.state.NodeState;
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.StructuredTranslation;
+import org.apache.joshua.decoder.Translation;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.lucene.analysis.Analyzer;
+import org.junit.Test;
+
+import static org.junit.Assert.assertNotNull;
+import static org.mockito.Matchers.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+/**
+ * Tests for {@link MTFulltextQueryTermsProvider}
+ */
+public class MTFulltextQueryTermsProviderTest {
+
+    /**
+     * A phrase whose structured translation scores above the minimum must produce a query.
+     */
+    @Test
+    public void testGetQueryTermWithPhraseTranslation() throws Exception {
+        Decoder decoder = mock(Decoder.class);
+        Translation translation = mock(Translation.class);
+        List<StructuredTranslation> translations = new LinkedList<>();
+        StructuredTranslation structuredTranslation = mock(StructuredTranslation.class);
+        when(structuredTranslation.getTranslationString()).thenReturn("fou bur");
+        translations.add(structuredTranslation);
+        when(translation.getStructuredTranslations()).thenReturn(translations);
+        when(decoder.decode(any(Sentence.class))).thenReturn(translation);
+        JoshuaConfiguration configuration = mock(JoshuaConfiguration.class);
+        when(decoder.getJoshuaConfiguration()).thenReturn(configuration);
+        Set<String> nodeTypes = new HashSet<>();
+        // negative minimum score: the unstubbed mock reports Mockito's default score of 0, which must pass
+        MTFulltextQueryTermsProvider mtFulltextQueryTermsProvider = new MTFulltextQueryTermsProvider(decoder, nodeTypes, -1);
+        Analyzer analyzer = mock(Analyzer.class);
+        NodeState indexDefinition = mock(NodeState.class);
+        // the translated phrase is accepted, so at least one clause is added and a query is returned
+        assertNotNull(mtFulltextQueryTermsProvider.getQueryTerm("foo bar", analyzer, indexDefinition));
+    }
+}
\ No newline at end of file
Property changes on: oak-lucene-mt/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/spi/mt/MTFulltextQueryTermsProviderTest.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property