/* * SPDX-License-Identifier: Apache-2.0 * * The OpenSearch Contributors require contributions made to * this file be licensed under the Apache-2.0 license or a * compatible open source license. */ /* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ /* * Modifications Copyright OpenSearch Contributors. See * GitHub history for details. */ package org.opensearch.indices.analysis; import org.apache.lucene.analysis.LowerCaseFilter; import org.opensearch.Version; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.NamedRegistry; import org.opensearch.common.logging.DeprecationLogger; import org.opensearch.common.settings.Settings; import org.opensearch.env.Environment; import org.opensearch.index.IndexSettings; import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.index.analysis.AnalyzerProvider; import org.opensearch.index.analysis.CharFilterFactory; import org.opensearch.index.analysis.HunspellTokenFilterFactory; import org.opensearch.index.analysis.KeywordAnalyzerProvider; import org.opensearch.index.analysis.LowercaseNormalizerProvider; import org.opensearch.index.analysis.PreBuiltAnalyzerProviderFactory; import org.opensearch.index.analysis.PreConfiguredCharFilter; import org.opensearch.index.analysis.PreConfiguredTokenFilter; import org.opensearch.index.analysis.PreConfiguredTokenizer; import org.opensearch.index.analysis.ShingleTokenFilterFactory; import org.opensearch.index.analysis.SimpleAnalyzerProvider; import org.opensearch.index.analysis.StandardAnalyzerProvider; import org.opensearch.index.analysis.StandardTokenizerFactory; import org.opensearch.index.analysis.StopAnalyzerProvider; import org.opensearch.index.analysis.StopTokenFilterFactory; import org.opensearch.index.analysis.TokenFilterFactory; import org.opensearch.index.analysis.TokenizerFactory; import org.opensearch.index.analysis.WhitespaceAnalyzerProvider; import org.opensearch.plugins.AnalysisPlugin; import java.io.IOException; import java.util.List; import java.util.Locale; import java.util.Map; import static java.util.Collections.unmodifiableMap; import static org.opensearch.plugins.AnalysisPlugin.requiresAnalysisSettings; /** * Sets up {@link AnalysisRegistry}. * * @opensearch.internal */ public final class AnalysisModule { static { Settings build = Settings.builder() .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT) .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) .build(); IndexMetadata metadata = IndexMetadata.builder("_na_").settings(build).build(); NA_INDEX_SETTINGS = new IndexSettings(metadata, Settings.EMPTY); } private static final IndexSettings NA_INDEX_SETTINGS; private static final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(AnalysisModule.class); private final HunspellService hunspellService; private final AnalysisRegistry analysisRegistry; public AnalysisModule(Environment environment, List plugins) throws IOException { NamedRegistry> charFilters = setupCharFilters(plugins); NamedRegistry hunspellDictionaries = setupHunspellDictionaries(plugins); hunspellService = new HunspellService(environment.settings(), environment, hunspellDictionaries.getRegistry()); NamedRegistry> tokenFilters = setupTokenFilters(plugins, hunspellService); NamedRegistry> tokenizers = setupTokenizers(plugins); NamedRegistry>> analyzers = setupAnalyzers(plugins); NamedRegistry>> normalizers = setupNormalizers(plugins); Map preConfiguredCharFilters = setupPreConfiguredCharFilters(plugins); Map preConfiguredTokenFilters = setupPreConfiguredTokenFilters(plugins); Map preConfiguredTokenizers = setupPreConfiguredTokenizers(plugins); Map preConfiguredAnalyzers = setupPreBuiltAnalyzerProviderFactories(plugins); analysisRegistry = new AnalysisRegistry( environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers.getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preConfiguredCharFilters, preConfiguredTokenFilters, preConfiguredTokenizers, preConfiguredAnalyzers ); } HunspellService getHunspellService() { return hunspellService; } public AnalysisRegistry getAnalysisRegistry() { return analysisRegistry; } private NamedRegistry> setupCharFilters(List plugins) { NamedRegistry> charFilters = new NamedRegistry<>("char_filter"); charFilters.extractAndRegister(plugins, AnalysisPlugin::getCharFilters); return charFilters; } public NamedRegistry setupHunspellDictionaries(List plugins) { NamedRegistry hunspellDictionaries = new NamedRegistry<>("dictionary"); hunspellDictionaries.extractAndRegister(plugins, AnalysisPlugin::getHunspellDictionaries); return hunspellDictionaries; } private NamedRegistry> setupTokenFilters( List plugins, HunspellService hunspellService ) { NamedRegistry> tokenFilters = new NamedRegistry<>("token_filter"); tokenFilters.register("stop", StopTokenFilterFactory::new); // Add "standard" for old indices (bwc) tokenFilters.register("standard", new AnalysisProvider() { @Override public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) { throw new IllegalArgumentException("The [standard] token filter has been removed."); } @Override public boolean requiresAnalysisSettings() { return false; } }); tokenFilters.register("shingle", ShingleTokenFilterFactory::new); tokenFilters.register( "hunspell", requiresAnalysisSettings( (indexSettings, env, name, settings) -> new HunspellTokenFilterFactory(indexSettings, name, settings, hunspellService) ) ); tokenFilters.extractAndRegister(plugins, AnalysisPlugin::getTokenFilters); return tokenFilters; } static Map setupPreBuiltAnalyzerProviderFactories(List plugins) { NamedRegistry preConfiguredCharFilters = new NamedRegistry<>("pre-built analyzer"); for (AnalysisPlugin plugin : plugins) { for (PreBuiltAnalyzerProviderFactory factory : plugin.getPreBuiltAnalyzerProviderFactories()) { preConfiguredCharFilters.register(factory.getName(), factory); } } return unmodifiableMap(preConfiguredCharFilters.getRegistry()); } static Map setupPreConfiguredCharFilters(List plugins) { NamedRegistry preConfiguredCharFilters = new NamedRegistry<>("pre-configured char_filter"); // No char filter are available in lucene-core so none are built in to OpenSearch core for (AnalysisPlugin plugin : plugins) { for (PreConfiguredCharFilter filter : plugin.getPreConfiguredCharFilters()) { preConfiguredCharFilters.register(filter.getName(), filter); } } return unmodifiableMap(preConfiguredCharFilters.getRegistry()); } static Map setupPreConfiguredTokenFilters(List plugins) { NamedRegistry preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter"); // Add filters available in lucene-core preConfiguredTokenFilters.register("lowercase", PreConfiguredTokenFilter.singleton("lowercase", true, LowerCaseFilter::new)); // Add "standard" for old indices (bwc) preConfiguredTokenFilters.register("standard", PreConfiguredTokenFilter.openSearchVersion("standard", true, (reader, version) -> { // This was originally removed in Legacy 7_0_0 but due to a cacheing bug it was still possible // in certain circumstances to create a new index referencing the standard token filter // until legacy version 7_5_2 // todo verify this can be removed in 3.0 throw new IllegalArgumentException("The [standard] token filter has been removed."); })); /* Note that "stop" is available in lucene-core but it's pre-built * version uses a set of English stop words that are in * lucene-analysis-common so "stop" is defined in the analysis-common * module. */ for (AnalysisPlugin plugin : plugins) { for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) { preConfiguredTokenFilters.register(filter.getName(), filter); } } return unmodifiableMap(preConfiguredTokenFilters.getRegistry()); } static Map setupPreConfiguredTokenizers(List plugins) { NamedRegistry preConfiguredTokenizers = new NamedRegistry<>("pre-configured tokenizer"); // Temporary shim to register old style pre-configured tokenizers for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) { String name = tokenizer.name().toLowerCase(Locale.ROOT); PreConfiguredTokenizer preConfigured; switch (tokenizer.getCachingStrategy()) { case ONE: preConfigured = PreConfiguredTokenizer.singleton(name, () -> tokenizer.create(Version.CURRENT)); break; default: throw new UnsupportedOperationException("Caching strategy unsupported by temporary shim [" + tokenizer + "]"); } preConfiguredTokenizers.register(name, preConfigured); } for (AnalysisPlugin plugin : plugins) { for (PreConfiguredTokenizer tokenizer : plugin.getPreConfiguredTokenizers()) { preConfiguredTokenizers.register(tokenizer.getName(), tokenizer); } } return unmodifiableMap(preConfiguredTokenizers.getRegistry()); } private NamedRegistry> setupTokenizers(List plugins) { NamedRegistry> tokenizers = new NamedRegistry<>("tokenizer"); tokenizers.register("standard", StandardTokenizerFactory::new); tokenizers.extractAndRegister(plugins, AnalysisPlugin::getTokenizers); return tokenizers; } private NamedRegistry>> setupAnalyzers(List plugins) { NamedRegistry>> analyzers = new NamedRegistry<>("analyzer"); analyzers.register("default", StandardAnalyzerProvider::new); analyzers.register("standard", StandardAnalyzerProvider::new); analyzers.register("simple", SimpleAnalyzerProvider::new); analyzers.register("stop", StopAnalyzerProvider::new); analyzers.register("whitespace", WhitespaceAnalyzerProvider::new); analyzers.register("keyword", KeywordAnalyzerProvider::new); analyzers.extractAndRegister(plugins, AnalysisPlugin::getAnalyzers); return analyzers; } private NamedRegistry>> setupNormalizers(List plugins) { NamedRegistry>> normalizers = new NamedRegistry<>("normalizer"); normalizers.register("lowercase", LowercaseNormalizerProvider::new); // TODO: pluggability? return normalizers; } /** * The basic factory interface for analysis components. */ public interface AnalysisProvider { /** * Creates a new analysis provider. * * @param indexSettings the index settings for the index this provider is created for * @param environment the nodes environment to load resources from persistent storage * @param name the name of the analysis component * @param settings the component specific settings without context prefixes * @return a new provider instance * @throws IOException if an {@link IOException} occurs */ T get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException; /** * Creates a new global scope analysis provider without index specific settings not settings for the provider itself. * This can be used to get a default instance of an analysis factory without binding to an index. * * @param environment the nodes environment to load resources from persistent storage * @param name the name of the analysis component * @return a new provider instance * @throws IOException if an {@link IOException} occurs * @throws IllegalArgumentException if the provider requires analysis settings ie. if {@link #requiresAnalysisSettings()} returns * true */ default T get(Environment environment, String name) throws IOException { if (requiresAnalysisSettings()) { throw new IllegalArgumentException("Analysis settings required - can't instantiate analysis factory"); } return get(NA_INDEX_SETTINGS, environment, name, NA_INDEX_SETTINGS.getSettings()); } /** * If true the analysis component created by this provider requires certain settings to be instantiated. * it can't be created with defaults. The default is false. */ default boolean requiresAnalysisSettings() { return false; } } }