/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 *
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 */

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.opensearch.hadoop.util;

import org.opensearch.hadoop.util.BytesArray;

import java.util.Arrays;

/**
 * Static helpers operating directly on UTF-8 encoded bytes.
 */
public abstract class BytesUtils {

    /**
     * Counts the chars within a given UTF-8 stream and matches the given char positions to the
     * stream byte positions, which are being returned.
     *
     * <p>The algorithm:
     * <ol>
     * <li>sorts a copy of the char positions for easy matching</li>
     * <li>iterates through the stream, counting chars, and records the byte index at which each
     * requested char position starts</li>
     * <li>maps the byte positions back to the original (unsorted) order of the char positions</li>
     * </ol>
     *
     * <p>A Unicode code point is one char (in UTF-16 and thus Java) inside the BMP and two outside
     * of it; in UTF-8 that is up to 3 bytes inside the BMP and 4 outside. Since this method counts
     * chars and not code points, a 4-byte sequence is stepped over as two chars of 2 bytes each.
     *
     * @param ba UTF-8 byte stream; scanning starts at {@code ba.offset} and the returned values
     *        are indices into {@code ba.bytes}
     * @param charPositions char positions to be matched in the stream
     * @return byte positions matching the char ones, in the same order as {@code charPositions};
     *         {@code -1} for any position (including negative ones) that was not reached
     */
    public static int[] charToBytePosition(BytesArray ba, int... charPositions) {
        int[] finalResults = new int[charPositions.length];
        if (charPositions.length == 0) {
            // nothing to look up (the lookup below would otherwise index into an empty array)
            return finalResults;
        }

        // sort a copy of the positions so they can be matched in a single forward pass
        int[] charOffsets = Arrays.copyOf(charPositions, charPositions.length);
        Arrays.sort(charOffsets);

        int[] results = new int[charOffsets.length];
        Arrays.fill(results, -1);

        int charOffsetIndex = 0;
        // negative positions can never match; skip them so they do not block the valid ones
        while (charOffsetIndex < charOffsets.length && charOffsets[charOffsetIndex] < 0) {
            charOffsetIndex++;
        }

        int currentCharOffset = 0;
        int byteIndex = ba.offset;
        // NOTE(review): the scan starts at ba.offset but is bounded by ba.size alone; if size is a
        // length relative to offset, a non-zero offset is under-scanned - confirm against BytesArray
        final int limit = ba.size;
        byte[] bytes = ba.bytes;
        boolean outsideBMP = false;

        // stop as soon as the stream is exhausted or every requested position has been resolved
        while (byteIndex < limit && charOffsetIndex < charOffsets.length) {
            int delta;
            if (outsideBMP) {
                // second half (low surrogate) of a 4-byte code point
                delta = 2;
                outsideBMP = false;
            } else {
                // the lead byte determines the length of the UTF-8 sequence
                int b = bytes[byteIndex] & 0xff;
                delta = (b < 0xc0 ? 1 : b < 0xe0 ? 2 : b < 0xf0 ? 3 : 4);
                outsideBMP = (delta == 4);
                // break the byte stepping into the two char surrogates
                if (outsideBMP) {
                    delta = 2;
                }
            }

            // record the byte index for every (possibly duplicated) request for this char
            while (charOffsetIndex < charOffsets.length && charOffsets[charOffsetIndex] == currentCharOffset) {
                results[charOffsetIndex++] = byteIndex;
            }

            byteIndex += delta;
            currentCharOffset++;
        }

        // return the results according to the original char position order; duplicates share the
        // same result so whichever sorted slot the binary search lands on is fine
        for (int originalPosition = 0; originalPosition < charPositions.length; originalPosition++) {
            int sortedPosition = Arrays.binarySearch(charOffsets, charPositions[originalPosition]);
            finalResults[originalPosition] = results[sortedPosition];
        }
        return finalResults;
    }

    /**
     * Skips leading white space in the given byte array. White space is defined in the context of
     * UTF-8 JSON, aka space, horizontal tab, line feed and carriage return.
     *
     * @param source bytes to inspect
     * @param start index of the first byte to inspect (inclusive)
     * @param stop index at which to stop inspecting (exclusive)
     * @return index of the first non-white-space byte in {@code [start, stop)}, or {@code stop}
     *         if there is none
     */
    public static int trimLeft(byte[] source, int start, int stop) {
        for (int i = start; i < stop; i++) {
            if (!isWhitespace(source[i])) {
                return i;
            }
        }
        return stop;
    }

    /**
     * Skips trailing white space in the given byte array, using the same white space definition
     * as {@link #trimLeft(byte[], int, int)}.
     *
     * <p>Note the bounds differ from {@code trimLeft}: the byte at {@code stop} <em>is</em>
     * inspected (so {@code stop} must be a valid index), while the byte at {@code start} is not.
     *
     * @param source bytes to inspect
     * @param start lower bound of the scan (the byte at this index is not inspected)
     * @param stop index of the last byte to inspect (inclusive)
     * @return index of the last non-white-space byte in {@code (start, stop]}, or {@code start}
     *         if there is none
     */
    public static int trimRight(byte[] source, int start, int stop) {
        for (int i = stop; i > start; i--) {
            if (!isWhitespace(source[i])) {
                return i;
            }
        }
        return start;
    }

    /**
     * Tells whether the byte is JSON white space: space (0x20), carriage return (0x0d),
     * line feed (0x0a) or horizontal tab (0x09).
     */
    private static boolean isWhitespace(byte current) {
        return current == 0x20 || current == 0x0d || current == 0x0a || current == 0x09;
    }
}