/* * SPDX-License-Identifier: Apache-2.0 * * The OpenSearch Contributors require contributions made to * this file be licensed under the Apache-2.0 license or a * compatible open source license. */ /* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ /* * Modifications Copyright OpenSearch Contributors. See * GitHub history for details. */ package org.apache.lucene.queries; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermStates; import org.apache.lucene.index.TermState; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.DisjunctionMaxQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.TermQuery; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.InPlaceMergeSorter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; /** * BlendedTermQuery can be used to unify term statistics across * one or more fields in the index. A common problem with structured * documents is that a term that is significant in on field might not be * significant in other fields like in a scenario where documents represent * users with a "first_name" and a "second_name". When someone searches * for "simon" it will very likely get "paul simon" first since "simon" is a * an uncommon last name ie. has a low document frequency. This query * tries to "lie" about the global statistics like document frequency as well * total term frequency to rank based on the estimated statistics. *
* While aggregating the total term frequency is trivial since it * can be summed up not every {@link org.apache.lucene.search.similarities.Similarity} * makes use of this statistic. The document frequency which is used in the * {@link org.apache.lucene.search.similarities.ClassicSimilarity} * can only be estimated as an lower-bound since it is a document based statistic. For * the document frequency the maximum frequency across all fields per term is used * which is the minimum number of documents the terms occurs in. *
*/ public abstract class BlendedTermQuery extends Query { private final Term[] terms; private final float[] boosts; public BlendedTermQuery(Term[] terms, float[] boosts) { if (terms == null || terms.length == 0) { throw new IllegalArgumentException("terms must not be null or empty"); } if (boosts != null && boosts.length != terms.length) { throw new IllegalArgumentException("boosts must have the same size as terms"); } this.terms = terms; this.boosts = boosts; } @Override public Query rewrite(IndexSearcher searcher) throws IOException { Query rewritten = super.rewrite(searcher); if (rewritten != this) { return rewritten; } IndexReader reader = searcher.getIndexReader(); IndexReaderContext context = reader.getContext(); TermStates[] ctx = new TermStates[terms.length]; int[] docFreqs = new int[ctx.length]; for (int i = 0; i < terms.length; i++) { ctx[i] = TermStates.build(context, terms[i], true); docFreqs[i] = ctx[i].docFreq(); } final int maxDoc = reader.maxDoc(); blend(ctx, maxDoc, reader); return topLevelQuery(terms, ctx, docFreqs, maxDoc); } protected abstract Query topLevelQuery(Term[] terms, TermStates[] ctx, int[] docFreqs, int maxDoc); protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader) throws IOException { if (contexts.length <= 1) { return; } int max = 0; long minSumTTF = Long.MAX_VALUE; for (int i = 0; i < contexts.length; i++) { TermStates ctx = contexts[i]; int df = ctx.docFreq(); // we use the max here since it's the only "true" estimation we can make here // at least max(df) documents have that term. Sum or Averages don't seem // to have a significant meaning here. // TODO: Maybe it could also make sense to assume independent distributions of documents and eg. have: // df = df1 + df2 - (df1 * df2 / maxDoc)? max = Math.max(df, max); if (ctx.totalTermFreq() > 0) { // we need to find out the minimum sumTTF to adjust the statistics // otherwise the statistics don't match minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field())); } } if (maxDoc > minSumTTF) { maxDoc = (int) minSumTTF; } if (max == 0) { return; // we are done that term doesn't exist at all } long sumTTF = 0; final int[] tieBreak = new int[contexts.length]; for (int i = 0; i < tieBreak.length; ++i) { tieBreak[i] = i; } new InPlaceMergeSorter() { @Override protected void swap(int i, int j) { final int tmp = tieBreak[i]; tieBreak[i] = tieBreak[j]; tieBreak[j] = tmp; } @Override protected int compare(int i, int j) { return Integer.compare(contexts[tieBreak[j]].docFreq(), contexts[tieBreak[i]].docFreq()); } }.sort(0, tieBreak.length); int prev = contexts[tieBreak[0]].docFreq(); int actualDf = Math.min(maxDoc, max); assert actualDf >= 0 : "DF must be >= 0"; // here we try to add a little bias towards // the more popular (more frequent) fields // that acts as a tie breaker for (int i : tieBreak) { TermStates ctx = contexts[i]; if (ctx.docFreq() == 0) { break; } final int current = ctx.docFreq(); if (prev > current) { actualDf++; } contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf)); prev = current; sumTTF += ctx.totalTermFreq(); } sumTTF = Math.min(sumTTF, minSumTTF); for (int i = 0; i < contexts.length; i++) { int df = contexts[i].docFreq(); if (df == 0) { continue; } contexts[i] = adjustTTF(reader.getContext(), contexts[i], sumTTF); } } private TermStates adjustTTF(IndexReaderContext readerContext, TermStates termContext, long sumTTF) throws IOException { assert termContext.wasBuiltFor(readerContext); TermStates newTermContext = new TermStates(readerContext); List