/* * SPDX-License-Identifier: Apache-2.0 * * The OpenSearch Contributors require contributions made to * this file be licensed under the Apache-2.0 license or a * compatible open source license. */ /* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ /* * Modifications Copyright OpenSearch Contributors. See * GitHub history for details. */ package org.opensearch.index.query; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.queries.intervals.IntervalMatchesIterator; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.queries.intervals.IntervalIterator; import org.apache.lucene.queries.intervals.Intervals; import org.apache.lucene.queries.intervals.IntervalsSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.List; /** * Constructs an IntervalsSource based on analyzed text * * @opensearch.internal */ public class IntervalBuilder { private final String field; private final Analyzer analyzer; public IntervalBuilder(String field, Analyzer analyzer) { this.field = field; this.analyzer = analyzer; } public IntervalsSource analyzeText(String query, int maxGaps, boolean ordered) throws IOException { return analyzeText(query, maxGaps, ordered ? IntervalMode.ORDERED : IntervalMode.UNORDERED); } public IntervalsSource analyzeText(String query, int maxGaps, IntervalMode mode) throws IOException { try (TokenStream ts = analyzer.tokenStream(field, query); CachingTokenFilter stream = new CachingTokenFilter(ts)) { return analyzeText(stream, maxGaps, mode); } } protected IntervalsSource analyzeText(CachingTokenFilter stream, int maxGaps, boolean ordered) throws IOException { return analyzeText(stream, maxGaps, ordered ? IntervalMode.ORDERED : IntervalMode.UNORDERED); } protected IntervalsSource analyzeText(CachingTokenFilter stream, int maxGaps, IntervalMode mode) throws IOException { TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class); PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class); if (termAtt == null) { return NO_INTERVALS; } // phase 1: read through the stream and assess the situation: // counting the number of tokens/positions and marking if we have any synonyms. int numTokens = 0; boolean hasSynonyms = false; boolean isGraph = false; stream.reset(); while (stream.incrementToken()) { numTokens++; int positionIncrement = posIncAtt.getPositionIncrement(); if (positionIncrement == 0) { hasSynonyms = true; } int positionLength = posLenAtt.getPositionLength(); if (positionLength > 1) { isGraph = true; } } // phase 2: based on token count, presence of synonyms, and options // formulate a single term, boolean, or phrase. if (numTokens == 0) { return NO_INTERVALS; } else if (numTokens == 1) { // single term return analyzeTerm(stream); } else if (isGraph) { // graph return combineSources(analyzeGraph(stream), maxGaps, mode); } else { // phrase if (hasSynonyms) { // phrase with single-term synonyms return analyzeSynonyms(stream, maxGaps, mode); } else { // simple phrase return combineSources(analyzeTerms(stream), maxGaps, mode); } } } protected IntervalsSource analyzeTerm(TokenStream ts) throws IOException { TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class); ts.reset(); ts.incrementToken(); return Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef())); } protected static IntervalsSource combineSources(List sources, int maxGaps, IntervalMode mode) { if (sources.size() == 0) { return NO_INTERVALS; } if (sources.size() == 1) { return sources.get(0); } IntervalsSource[] sourcesArray = sources.toArray(new IntervalsSource[0]); if (maxGaps == 0 && mode == IntervalMode.ORDERED) { return Intervals.phrase(sourcesArray); } IntervalsSource inner; if (mode == IntervalMode.ORDERED) { inner = Intervals.ordered(sourcesArray); } else if (mode == IntervalMode.UNORDERED) { inner = Intervals.unordered(sourcesArray); } else { inner = Intervals.unorderedNoOverlaps(sourcesArray[0], sourcesArray[1]); for (int sourceIdx = 2; sourceIdx < sourcesArray.length; sourceIdx++) { inner = Intervals.unorderedNoOverlaps(maxGaps == -1 ? inner : Intervals.maxgaps(maxGaps, inner), sourcesArray[sourceIdx]); } } if (maxGaps == -1) { return inner; } return Intervals.maxgaps(maxGaps, inner); } protected List analyzeTerms(TokenStream ts) throws IOException { List terms = new ArrayList<>(); TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); while (ts.incrementToken()) { BytesRef term = bytesAtt.getBytesRef(); int precedingSpaces = posAtt.getPositionIncrement() - 1; terms.add(extend(Intervals.term(BytesRef.deepCopyOf(term)), precedingSpaces)); } ts.end(); return terms; } public static IntervalsSource extend(IntervalsSource source, int precedingSpaces) { if (precedingSpaces == 0) { return source; } return Intervals.extend(source, precedingSpaces, 0); } protected IntervalsSource analyzeSynonyms(TokenStream ts, int maxGaps, IntervalMode mode) throws IOException { List terms = new ArrayList<>(); List synonyms = new ArrayList<>(); TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); int spaces = 0; while (ts.incrementToken()) { int posInc = posAtt.getPositionIncrement(); if (posInc > 0) { if (synonyms.size() == 1) { terms.add(extend(synonyms.get(0), spaces)); } else if (synonyms.size() > 1) { terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces)); } synonyms.clear(); spaces = posInc - 1; } synonyms.add(Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef()))); } if (synonyms.size() == 1) { terms.add(extend(synonyms.get(0), spaces)); } else { terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces)); } return combineSources(terms, maxGaps, mode); } protected List analyzeGraph(TokenStream source) throws IOException { source.reset(); GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source); List clauses = new ArrayList<>(); int[] articulationPoints = graph.articulationPoints(); int lastState = 0; int maxClauseCount = BooleanQuery.getMaxClauseCount(); for (int i = 0; i <= articulationPoints.length; i++) { int start = lastState; int end = -1; if (i < articulationPoints.length) { end = articulationPoints[i]; } lastState = end; if (graph.hasSidePath(start)) { List paths = new ArrayList<>(); Iterator it = graph.getFiniteStrings(start, end); while (it.hasNext()) { TokenStream ts = it.next(); IntervalsSource phrase = combineSources(analyzeTerms(ts), 0, IntervalMode.ORDERED); if (paths.size() >= maxClauseCount) { throw new BooleanQuery.TooManyClauses(); } paths.add(phrase); } if (paths.size() > 0) { clauses.add(Intervals.or(paths.toArray(new IntervalsSource[0]))); } } else { Iterator it = graph.getFiniteStrings(start, end); TokenStream ts = it.next(); clauses.addAll(analyzeTerms(ts)); assert it.hasNext() == false; } } return clauses; } static final IntervalsSource NO_INTERVALS = new IntervalsSource() { @Override public IntervalIterator intervals(String field, LeafReaderContext ctx) { return new IntervalIterator() { @Override public int start() { return NO_MORE_INTERVALS; } @Override public int end() { return NO_MORE_INTERVALS; } @Override public int gaps() { throw new UnsupportedOperationException(); } @Override public int nextInterval() { return NO_MORE_INTERVALS; } @Override public float matchCost() { return 0; } @Override public int docID() { return NO_MORE_DOCS; } @Override public int nextDoc() { return NO_MORE_DOCS; } @Override public int advance(int target) { return NO_MORE_DOCS; } @Override public long cost() { return 0; } }; } @Override public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) { return null; } @Override public void visit(String field, QueryVisitor visitor) {} @Override public int minExtent() { return 0; } @Override public Collection pullUpDisjunctions() { return Collections.emptyList(); } @Override public int hashCode() { return 0; } @Override public boolean equals(Object other) { return other == this; } @Override public String toString() { return "no_match"; } }; }