/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.search.suggest.phrase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.opensearch.search.suggest.phrase.NoisyChannelSpellChecker.Result;
import org.opensearch.test.OpenSearchTestCase;
import java.io.CharArrayReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import static org.opensearch.search.suggest.phrase.NoisyChannelSpellChecker.DEFAULT_TOKEN_LIMIT;
import static org.opensearch.search.suggest.phrase.NoisyChannelSpellChecker.REAL_WORD_LIKELIHOOD;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
public class NoisyChannelSpellCheckerTests extends OpenSearchTestCase {
// Separator passed to Correction#join when a correction is rendered as a single string.
private final BytesRef space = new BytesRef(" ");
// Pre/post tags handed to the three-argument (highlighting) form of Correction#join. Both are empty
// here, so the tests expect the highlighted rendering to equal the plain rendering.
private final BytesRef preTag = new BytesRef("");
private final BytesRef postTag = new BytesRef("");
/**
 * Runs the phrase suggester with a {@link LaplaceScorer} over a small in-memory index and checks
 * the returned corrections for plain queries, several confidence / gram-size settings, the
 * tagged form of {@code join}, and queries analyzed through a synonym filter.
 *
 * The directory, index writer, reader and analyzers are all managed by try-with-resources so
 * they are released even when an assertion fails.
 */
public void testNgram() throws IOException {
    // "body_ngram" is indexed as lower-cased shingles of 2 to 3 tokens (no unigrams); "body" as lower-cased tokens.
    Map<String, Analyzer> mapping = new HashMap<>();
    mapping.put("body_ngram", new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false);
            return new TokenStreamComponents(t, new LowerCaseFilter(tf));
        }
    });
    mapping.put("body", new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            return new TokenStreamComponents(t, new LowerCaseFilter(t));
        }
    });
    // Query-side analyzer used by the synonym section below: lower-cases, then applies the rule
    // "usa => usa, america, american". Components are only built when a token stream is requested.
    Analyzer synonymAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            TokenFilter filter = new LowerCaseFilter(t);
            try {
                SolrSynonymParser parser = new SolrSynonymParser(true, false, new WhitespaceAnalyzer());
                parser.parse(new StringReader("usa => usa, america, american"));
                filter = new SynonymFilter(filter, parser.build(), true);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            return new TokenStreamComponents(t, filter);
        }
    };
    String[] strings = new String[] {
        "Xorr the God-Jewel",
        "Grog the God-Crusher",
        "Xorn",
        "Walter Newell",
        "Wanda Maximoff",
        "Captain America",
        "American Ace",
        "USA Hero",
        "Wundarr the Aquarian",
        "Will o' the Wisp",
        "Xemnu the Titan",
        "Fantastic Four",
        "Quasar",
        "Quasar II" };
    // Resources are closed in reverse order: writer, analyzers, then the directory.
    try (
        Directory dir = new ByteBuffersDirectory();
        PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
        Analyzer analyzer = synonymAnalyzer;
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(wrapper))
    ) {
        for (String line : strings) {
            Document doc = new Document();
            doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
            doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
            writer.addDocument(doc);
        }
        // The reader is opened from the writer, so it is closed before the writer is.
        try (DirectoryReader ir = DirectoryReader.open(writer)) {
            WordScorer wordScorer = new LaplaceScorer(
                ir,
                MultiTerms.getTerms(ir, "body_ngram"),
                "body_ngram",
                0.95d,
                new BytesRef(" "),
                0.5f
            );
            NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(REAL_WORD_LIKELIHOOD, true, DEFAULT_TOKEN_LIMIT);
            DirectSpellChecker spellchecker = new DirectSpellChecker();
            spellchecker.setMinQueryLength(1);
            DirectCandidateGenerator generator = new DirectCandidateGenerator(
                spellchecker,
                "body",
                SuggestMode.SUGGEST_MORE_POPULAR,
                ir,
                0.95,
                5
            );

            // confidence 1, gram size 2
            Result result = getCorrections(
                suggester, wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2
            );
            Correction[] corrections = result.corrections;
            assertThat(corrections.length, equalTo(1));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace"));
            assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ace"));
            assertThat(result.cutoffScore, greaterThan(0d));

            // confidence 0, gram size 1: the input comes back unchanged and the cutoff is Double.MIN_VALUE
            result = getCorrections(
                suggester, wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1
            );
            corrections = result.corrections;
            assertThat(corrections.length, equalTo(1));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ame"));
            assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ame"));
            assertThat(result.cutoffScore, equalTo(Double.MIN_VALUE));

            suggester = new NoisyChannelSpellChecker(0.85, true, DEFAULT_TOKEN_LIMIT);
            wordScorer = new LaplaceScorer(
                ir,
                MultiTerms.getTerms(ir, "body_ngram"),
                "body_ngram",
                0.85d,
                new BytesRef(" "),
                0.5f
            );
            corrections = getCorrections(
                suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2
            ).corrections;
            assertThat(corrections.length, equalTo(4));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
            assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
            assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
            assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel"));
            assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("xorr the god jewel"));
            assertThat(corrections[1].join(space, preTag, postTag).utf8ToString(), equalTo("xor the god jewel"));
            assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("xorn the god jewel"));
            assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("xorr the got jewel"));

            corrections = getCorrections(
                suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2
            ).corrections;
            assertThat(corrections.length, equalTo(4));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
            assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
            assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
            assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel"));

            // Test some of the highlighting corner cases
            suggester = new NoisyChannelSpellChecker(0.85, true, DEFAULT_TOKEN_LIMIT);
            wordScorer = new LaplaceScorer(
                ir,
                MultiTerms.getTerms(ir, "body_ngram"),
                "body_ngram",
                0.85d,
                new BytesRef(" "),
                0.5f
            );
            corrections = getCorrections(
                suggester, wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir, "body", wordScorer, 1, 2
            ).corrections;
            assertThat(corrections.length, equalTo(4));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
            assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
            assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
            assertThat(corrections[3].join(space).utf8ToString(), equalTo("xor teh god jewel"));
            assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("xorr the god jewel"));
            assertThat(corrections[1].join(space, preTag, postTag).utf8ToString(), equalTo("xor the god jewel"));
            assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("xorn the god jewel"));
            assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("xor teh god jewel"));

            // test synonyms
            spellchecker.setAccuracy(0.0f);
            spellchecker.setMinPrefix(1);
            spellchecker.setMinQueryLength(1);
            suggester = new NoisyChannelSpellChecker(0.85, true, DEFAULT_TOKEN_LIMIT);
            wordScorer = new LaplaceScorer(
                ir,
                MultiTerms.getTerms(ir, "body_ngram"),
                "body_ngram",
                0.85d,
                new BytesRef(" "),
                0.5f
            );
            corrections = getCorrections(
                suggester, analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2
            ).corrections;
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america"));
            assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain america"));

            // Same generator settings, but now with the synonym analyzer as the generator's post filter.
            generator = new DirectCandidateGenerator(
                spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiTerms.getTerms(ir, "body")
            );
            corrections = getCorrections(
                suggester, analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2
            ).corrections;
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america"));
            assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain america"));

            // Make sure that user supplied text is not marked as highlighted in the presence of a synonym filter
            generator = new DirectCandidateGenerator(
                spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiTerms.getTerms(ir, "body")
            );
            corrections = getCorrections(
                suggester, analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2
            ).corrections;
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america"));
            assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain america"));
        }
    }
}
/**
 * Combines a forward candidate generator on "body" with a second one on "body_reverse" (wrapped
 * in a {@link MultiCandidateGeneratorWrapper}) and checks the corrections returned for several
 * misspelled queries, including one the forward generator alone cannot correct.
 *
 * The directory, index writer, reader and analyzer wrapper are managed by try-with-resources so
 * they are released even when an assertion fails.
 */
public void testMultiGenerator() throws IOException {
    // "body_ngram": lower-cased shingles of 2 to 3 tokens (no unigrams); "body": lower-cased tokens;
    // "body_reverse": lower-cased tokens with each token's characters reversed.
    Map<String, Analyzer> mapping = new HashMap<>();
    mapping.put("body_ngram", new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false);
            return new TokenStreamComponents(t, new LowerCaseFilter(tf));
        }
    });
    mapping.put("body", new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            return new TokenStreamComponents(t, new LowerCaseFilter(t));
        }
    });
    mapping.put("body_reverse", new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            return new TokenStreamComponents(t, new ReverseStringFilter(new LowerCaseFilter(t)));
        }
    });
    String[] strings = new String[] {
        "Xorr the God-Jewel",
        "Grog the God-Crusher",
        "Xorn",
        "Walter Newell",
        "Wanda Maximoff",
        "Captain America",
        "American Ace",
        "Wundarr the Aquarian",
        "Will o' the Wisp",
        "Xemnu the Titan",
        "Fantastic Four",
        "Quasar",
        "Quasar II" };
    // Resources are closed in reverse order: writer, analyzer wrapper, then the directory.
    try (
        Directory dir = new ByteBuffersDirectory();
        PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(wrapper))
    ) {
        for (String line : strings) {
            Document doc = new Document();
            doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
            doc.add(new Field("body_reverse", line, TextField.TYPE_NOT_STORED));
            doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
            writer.addDocument(doc);
        }
        // The reader is opened from the writer, so it is closed before the writer is.
        try (DirectoryReader ir = DirectoryReader.open(writer)) {
            LaplaceScorer wordScorer = new LaplaceScorer(
                ir,
                MultiTerms.getTerms(ir, "body_ngram"),
                "body_ngram",
                0.95d,
                new BytesRef(" "),
                0.5f
            );
            NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(REAL_WORD_LIKELIHOOD, true, DEFAULT_TOKEN_LIMIT);
            DirectSpellChecker spellchecker = new DirectSpellChecker();
            spellchecker.setMinQueryLength(1);
            DirectCandidateGenerator forward = new DirectCandidateGenerator(
                spellchecker,
                "body",
                SuggestMode.SUGGEST_ALWAYS,
                ir,
                0.95,
                10
            );
            // The reverse generator uses the per-field wrapper as both its pre and post filter.
            DirectCandidateGenerator reverse = new DirectCandidateGenerator(
                spellchecker,
                "body_reverse",
                SuggestMode.SUGGEST_ALWAYS,
                ir,
                0.95,
                10,
                wrapper,
                wrapper,
                MultiTerms.getTerms(ir, "body_reverse")
            );
            CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse);

            Correction[] corrections = getCorrections(
                suggester, wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2
            ).corrections;
            assertThat(corrections.length, equalTo(1));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace"));

            generator = new MultiCandidateGeneratorWrapper(5, forward, reverse);
            corrections = getCorrections(
                suggester, wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2
            ).corrections;
            assertThat(corrections.length, equalTo(1));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace"));

            corrections = getCorrections(
                suggester, wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2
            ).corrections;
            assertThat(corrections.length, equalTo(0)); // only use forward with constant prefix

            corrections = getCorrections(
                suggester, wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2
            ).corrections;
            assertThat(corrections.length, equalTo(1));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace"));

            corrections = getCorrections(
                suggester, wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2
            ).corrections;
            assertThat(corrections.length, equalTo(4));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
            assertThat(corrections[1].join(space).utf8ToString(), equalTo("zorr the god jewel"));
            assertThat(corrections[2].join(space).utf8ToString(), equalTo("four the god jewel"));

            corrections = getCorrections(
                suggester, wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2
            ).corrections;
            assertThat(corrections.length, equalTo(1));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));

            corrections = getCorrections(
                suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2
            ).corrections;
            assertThat(corrections.length, equalTo(1));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));

            // Test a special case where one of the suggest term is unchanged by the postFilter, 'II' here is unchanged by the
            // reverse analyzer.
            corrections = getCorrections(
                suggester, wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir, "body", wordScorer, 1, 2
            ).corrections;
            assertThat(corrections.length, equalTo(1));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("quasar ii"));
        }
    }
}
/**
 * Runs the phrase suggester with gram size 3 using a {@link LinearInterpolatingScorer} and, at
 * the end, a {@link StupidBackoffScorer}, checking the corrections for plain queries, a gram
 * size of 1, and queries analyzed through a synonym filter.
 *
 * The directory, index writer, reader and analyzers are all managed by try-with-resources so
 * they are released even when an assertion fails.
 */
public void testTrigram() throws IOException {
    // "body_ngram" is indexed as lower-cased shingles of 2 to 3 tokens (no unigrams); "body" as lower-cased tokens.
    Map<String, Analyzer> mapping = new HashMap<>();
    mapping.put("body_ngram", new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false);
            return new TokenStreamComponents(t, new LowerCaseFilter(tf));
        }
    });
    mapping.put("body", new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            return new TokenStreamComponents(t, new LowerCaseFilter(t));
        }
    });
    // Query-side analyzer used by the synonym section below: lower-cases, then applies the rule
    // "usa => usa, america, american". Components are only built when a token stream is requested.
    Analyzer synonymAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            TokenFilter filter = new LowerCaseFilter(t);
            try {
                SolrSynonymParser parser = new SolrSynonymParser(true, false, new WhitespaceAnalyzer());
                parser.parse(new StringReader("usa => usa, america, american"));
                filter = new SynonymFilter(filter, parser.build(), true);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            return new TokenStreamComponents(t, filter);
        }
    };
    String[] strings = new String[] {
        "Xorr the God-Jewel",
        "Grog the God-Crusher",
        "Xorn",
        "Walter Newell",
        "Wanda Maximoff",
        "Captain America",
        "American Ace",
        "USA Hero",
        "Wundarr the Aquarian",
        "Will o' the Wisp",
        "Xemnu the Titan",
        "Fantastic Four",
        "Quasar",
        "Quasar II" };
    // Resources are closed in reverse order: writer, analyzers, then the directory.
    try (
        Directory dir = new ByteBuffersDirectory();
        PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
        Analyzer analyzer = synonymAnalyzer;
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(wrapper))
    ) {
        for (String line : strings) {
            Document doc = new Document();
            doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
            doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
            writer.addDocument(doc);
        }
        // The reader is opened from the writer, so it is closed before the writer is.
        try (DirectoryReader ir = DirectoryReader.open(writer)) {
            WordScorer wordScorer = new LinearInterpolatingScorer(
                ir,
                MultiTerms.getTerms(ir, "body_ngram"),
                "body_ngram",
                0.85d,
                new BytesRef(" "),
                0.5,
                0.4,
                0.1
            );
            NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(REAL_WORD_LIKELIHOOD, true, DEFAULT_TOKEN_LIMIT);
            DirectSpellChecker spellchecker = new DirectSpellChecker();
            spellchecker.setMinQueryLength(1);
            DirectCandidateGenerator generator = new DirectCandidateGenerator(
                spellchecker,
                "body",
                SuggestMode.SUGGEST_MORE_POPULAR,
                ir,
                0.95,
                5
            );

            Correction[] corrections = getCorrections(
                suggester, wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 3
            ).corrections;
            assertThat(corrections.length, equalTo(1));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace"));

            // With gram size 1 the same query is expected to yield no correction.
            corrections = getCorrections(
                suggester, wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 1
            ).corrections;
            assertThat(corrections.length, equalTo(0));

            wordScorer = new LinearInterpolatingScorer(
                ir,
                MultiTerms.getTerms(ir, "body_ngram"),
                "body_ngram",
                0.85d,
                new BytesRef(" "),
                0.5,
                0.4,
                0.1
            );
            corrections = getCorrections(
                suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 3
            ).corrections;
            assertThat(corrections.length, equalTo(4));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
            assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
            assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
            assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel"));

            corrections = getCorrections(
                suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 3
            ).corrections;
            assertThat(corrections.length, equalTo(4));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
            assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
            assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
            assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel"));

            corrections = getCorrections(
                suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 100, 3
            ).corrections;
            assertThat(corrections.length, equalTo(1));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));

            // test synonyms
            spellchecker.setAccuracy(0.0f);
            spellchecker.setMinPrefix(1);
            spellchecker.setMinQueryLength(1);
            suggester = new NoisyChannelSpellChecker(0.95, true, DEFAULT_TOKEN_LIMIT);
            wordScorer = new LinearInterpolatingScorer(
                ir,
                MultiTerms.getTerms(ir, "body_ngram"),
                "body_ngram",
                0.95d,
                new BytesRef(" "),
                0.5,
                0.4,
                0.1
            );
            corrections = getCorrections(
                suggester, analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 3
            ).corrections;
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america"));

            // Now with the synonym analyzer as the generator's post filter.
            generator = new DirectCandidateGenerator(
                spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 10, null, analyzer, MultiTerms.getTerms(ir, "body")
            );
            corrections = getCorrections(
                suggester, analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 3
            ).corrections;
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america"));

            // Same index, scored with StupidBackoffScorer instead.
            wordScorer = new StupidBackoffScorer(
                ir,
                MultiTerms.getTerms(ir, "body_ngram"),
                "body_ngram",
                0.85d,
                new BytesRef(" "),
                0.4
            );
            corrections = getCorrections(
                suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir, "body", wordScorer, 0, 3
            ).corrections;
            assertThat(corrections.length, equalTo(2));
            assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
            assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
        }
    }
}
/**
 * Indexes two single-field documents, only one of which has the suggested field, and checks
 * that the misspelling "valeu" is still corrected to "value".
 *
 * The query analyzer is opened in the same try-with-resources as the reader so that it is
 * closed together with the other resources of this test.
 */
public void testFewDocsEgdeCase() throws Exception {
    try (Directory dir = newDirectory()) {
        try (IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig())) {
            Document document = new Document();
            document.add(new TextField("field", "value", Field.Store.NO));
            iw.addDocument(document);
            // Explicit commit between the two documents.
            // NOTE(review): presumably so that they end up in separate segments - confirm.
            iw.commit();
            document = new Document();
            document.add(new TextField("other_field", "value", Field.Store.NO));
            iw.addDocument(document);
        }

        try (DirectoryReader ir = DirectoryReader.open(dir); Analyzer analyzer = new StandardAnalyzer()) {
            WordScorer wordScorer = new StupidBackoffScorer(
                ir,
                MultiTerms.getTerms(ir, "field"),
                "field",
                0.95d,
                new BytesRef(" "),
                0.4f
            );
            NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(REAL_WORD_LIKELIHOOD, true, DEFAULT_TOKEN_LIMIT);
            DirectSpellChecker spellchecker = new DirectSpellChecker();
            DirectCandidateGenerator generator = new DirectCandidateGenerator(
                spellchecker,
                "field",
                SuggestMode.SUGGEST_MORE_POPULAR,
                ir,
                0.95,
                5
            );
            Result result = getCorrections(
                suggester,
                analyzer,
                new BytesRef("valeu"),
                generator,
                1,
                1,
                ir,
                "field",
                wordScorer,
                1,
                2
            );
            assertThat(result.corrections.length, equalTo(1));
            assertThat(result.corrections[0].join(space).utf8ToString(), equalTo("value"));
        }
    }
}
/**
 * Analyzes {@code query} with {@code analyzer} for {@code analysisField} and passes the
 * resulting token stream, together with the remaining arguments, to
 * {@link NoisyChannelSpellChecker#getCorrections}.
 *
 * Note that {@code reader} is accepted for call-site symmetry but is not used here.
 */
private Result getCorrections(
    NoisyChannelSpellChecker checker,
    Analyzer analyzer,
    BytesRef query,
    CandidateGenerator generator,
    float maxErrors,
    int numCorrections,
    IndexReader reader,
    String analysisField,
    WordScorer scorer,
    float confidence,
    int gramSize
) throws IOException {
    // Decode the UTF-8 query bytes into a char buffer that the analyzer can read from.
    CharsRefBuilder decodedQuery = new CharsRefBuilder();
    decodedQuery.copyUTF8Bytes(query);
    CharArrayReader queryReader = new CharArrayReader(decodedQuery.chars(), 0, decodedQuery.length());

    TokenStream analyzedQuery = analyzer.tokenStream(analysisField, queryReader);
    return checker.getCorrections(analyzedQuery, generator, maxErrors, numCorrections, scorer, confidence, gramSize);
}
}