/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.ingest.attachment;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.opensearch.SpecialPermission;
import org.opensearch.bootstrap.FilePermissionUtils;
import org.opensearch.bootstrap.JarHell;
import org.opensearch.common.SuppressForbidden;
import org.opensearch.common.io.PathUtils;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.lang.reflect.ReflectPermission;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.AccessControlContext;
import java.security.AccessController;
import java.security.PermissionCollection;
import java.security.Permissions;
import java.security.PrivilegedActionException;
import java.security.PrivilegedExceptionAction;
import java.security.ProtectionDomain;
import java.security.SecurityPermission;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.PropertyPermission;
import java.util.Set;
/**
* Runs tika with limited parsers and limited permissions.
*
* Do NOT make public
*/
final class TikaImpl {
/** Exclude some formats */
private static final Set EXCLUDES = new HashSet<>(
Arrays.asList(
MediaType.application("vnd.ms-visio.drawing"),
MediaType.application("vnd.ms-visio.drawing.macroenabled.12"),
MediaType.application("vnd.ms-visio.stencil"),
MediaType.application("vnd.ms-visio.stencil.macroenabled.12"),
MediaType.application("vnd.ms-visio.template"),
MediaType.application("vnd.ms-visio.template.macroenabled.12"),
MediaType.application("vnd.ms-visio.drawing")
)
);
/** subset of parsers for types we support */
private static final Parser PARSERS[] = new Parser[] {
// documents
new org.apache.tika.parser.html.HtmlParser(),
new org.apache.tika.parser.pdf.PDFParser(),
new org.apache.tika.parser.txt.TXTParser(),
new org.apache.tika.parser.microsoft.rtf.RTFParser(),
new org.apache.tika.parser.microsoft.OfficeParser(),
new org.apache.tika.parser.microsoft.OldExcelParser(),
ParserDecorator.withoutTypes(new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES),
new org.apache.tika.parser.odf.OpenDocumentParser(),
new org.apache.tika.parser.iwork.IWorkPackageParser(),
new org.apache.tika.parser.xml.DcXMLParser(),
new org.apache.tika.parser.epub.EpubParser(), };
/** autodetector based on this subset */
private static final AutoDetectParser PARSER_INSTANCE = new AutoDetectParser(PARSERS);
/** singleton tika instance */
private static final Tika TIKA_INSTANCE = new Tika(PARSER_INSTANCE.getDetector(), PARSER_INSTANCE);
/**
* parses with tika, throwing any exception hit while parsing the document
*/
static String parse(final byte content[], final Metadata metadata, final int limit) throws TikaException, IOException {
// check that its not unprivileged code like a script
SpecialPermission.check();
try {
return AccessController.doPrivileged(
(PrivilegedExceptionAction) () -> TIKA_INSTANCE.parseToString(new ByteArrayInputStream(content), metadata, limit),
RESTRICTED_CONTEXT
);
} catch (PrivilegedActionException e) {
// checked exception from tika: unbox it
Throwable cause = e.getCause();
if (cause instanceof TikaException) {
throw (TikaException) cause;
} else if (cause instanceof IOException) {
throw (IOException) cause;
} else {
throw new AssertionError(cause);
}
}
}
// apply additional containment for parsers, this is intersected with the current permissions
// its hairy, but worth it so we don't have some XML flaw reading random crap from the FS
private static final AccessControlContext RESTRICTED_CONTEXT = new AccessControlContext(
new ProtectionDomain[] { new ProtectionDomain(null, getRestrictedPermissions()) }
);
// compute some minimal permissions for parsers. they only get r/w access to the java temp directory,
// the ability to load some resources from JARs, and read sysprops
@SuppressForbidden(reason = "adds access to tmp directory")
static PermissionCollection getRestrictedPermissions() {
Permissions perms = new Permissions();
// property/env access needed for parsing
perms.add(new PropertyPermission("*", "read"));
perms.add(new RuntimePermission("getenv.TIKA_CONFIG"));
try {
// add permissions for resource access:
// classpath
addReadPermissions(perms, JarHell.parseClassPath());
// plugin jars
if (TikaImpl.class.getClassLoader() instanceof URLClassLoader) {
URL[] urls = ((URLClassLoader) TikaImpl.class.getClassLoader()).getURLs();
Set set = new LinkedHashSet<>(Arrays.asList(urls));
if (set.size() != urls.length) {
throw new AssertionError("duplicate jars: " + Arrays.toString(urls));
}
addReadPermissions(perms, set);
}
// jvm's java.io.tmpdir (needs read/write)
FilePermissionUtils.addDirectoryPath(
perms,
"java.io.tmpdir",
PathUtils.get(System.getProperty("java.io.tmpdir")),
"read,readlink,write,delete",
false
);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
// current hacks needed for POI/PDFbox issues:
perms.add(new SecurityPermission("putProviderProperty.BC"));
perms.add(new SecurityPermission("insertProvider"));
perms.add(new ReflectPermission("suppressAccessChecks"));
perms.add(new RuntimePermission("accessClassInPackage.sun.java2d.cmm.kcms"));
// xmlbeans, use by POI, needs to get the context classloader
perms.add(new RuntimePermission("getClassLoader"));
perms.setReadOnly();
return perms;
}
// add resources to (what is typically) a jar, but might not be (e.g. in tests/IDE)
@SuppressForbidden(reason = "adds access to jar resources")
static void addReadPermissions(Permissions perms, Set resources) throws IOException {
try {
for (URL url : resources) {
Path path = PathUtils.get(url.toURI());
if (Files.isDirectory(path)) {
FilePermissionUtils.addDirectoryPath(perms, "class.path", path, "read,readlink", false);
} else {
FilePermissionUtils.addSingleFilePath(perms, path, "read,readlink");
}
}
} catch (URISyntaxException bogus) {
throw new RuntimeException(bogus);
}
}
}