// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 // clang-format off #include "stdafx.h" // clang-format on #include using namespace std; using namespace frantic; using frantic::channels::channel_map; using frantic::channels::data_type_t; namespace frantic { namespace particles { namespace streams { namespace { void guess_native_channel_map_from_numeric_headers( const std::vector& columnHeaders, bool noError, data_type_t positionTypeHint, frantic::channels::channel_column_map& outGuess ) { frantic::channels::channel_map channelMap; if( !noError && columnHeaders.size() < 3 ) { throw frantic::invalid_particle_file_exception() << "csv_particle_istream : CSV particle file must have at least 3 columns for position information."; } else { // Always specifies a position channel with xyz components channelMap.define_channel( _T("Position"), 3, ( positionTypeHint == channels::data_type_invalid ) ? channels::data_type_float32 : positionTypeHint ); if( columnHeaders.size() == 6 ) { channelMap.define_channel( _T("Color"), 3, frantic::channels::data_type_float16 ); } else { // Otherwise, remaining columns with Data1, Data2, etc... for( size_t i = 3; i < columnHeaders.size(); ++i ) { channelMap.define_channel( _T( "Data" ) + boost::lexical_cast( i - 2 ), 1, frantic::channels::data_type_float32 ); } } } channelMap.end_channel_definition(); outGuess.reset( channelMap, columnHeaders.size() ); for( size_t i = 0; i < 3; ++i ) { outGuess.set_column_mapping( i, _T("Position"), i ); } if( columnHeaders.size() == 6 ) { for( size_t i = 0; i < 3; ++i ) { outGuess.set_column_mapping( 3 + i, _T("Color"), i ); } } else { for( size_t i = 3; i < columnHeaders.size(); ++i ) { outGuess.set_column_mapping( i, _T( "Data" ) + boost::lexical_cast( i - 2 ), 0 ); } } } /** * Holds the data type and optionally the index of a numbered column. */ struct column_mapping { column_mapping() {} column_mapping( size_t _column, frantic::channels::data_type_t _type ) : column( _column ) , type( _type ) , hasIndex( false ) {} column_mapping( size_t _column, frantic::channels::data_type_t _type, size_t _index ) : column( _column ) , type( _type ) , hasIndex( true ) , index( _index ) {} size_t column; frantic::channels::data_type_t type; bool hasIndex; size_t index; bool operator<( const column_mapping& other ) const { return index < other.index; } }; /** * Attempts to construct a channel map and associated column mappings to it. This allows interleaved channels, and * 'gaps' in the data * * The guessing method assumes columns are in the form "type name[index]" * - "type" is optional on all columns, however any columns with the same "name" should match the type of all other * columns with that name, or be blank * -- If all columns of the same name are blank, the type will default to float32 * - "[index]" is optional on all columns * -- If indices are omitted for all columns, their order and arity is implicit by the order they appear * -- If indices are given for all columns, they must be contiguous, and either 0 or 1-based indices * -- It is an error if only some indices are specified * * These rules are effectively arbitrary, if any changes seem reasonable to you, go for it. Note that these rules are * loosened in 'noError' mode to prevent the method from ever failing, at the cost of accepting weird inputs with weird * (though still consistent) results. * * \param columnHeaders Set of strings at the head of the csv file * \param file Name of the file which the data is loaded from * \param noError Loosens the rules to prevent the method ever throwing any errors. Used to present the user with a best * guess and lets them refine it later * \param positionTypeHint The dtype (data_type_float32 or data_type_float64) that the Position channel should have. * Use data_type_invalid to let the function choose. \param outGuess The resulting channel column mapping for this csv * file * */ void guess_native_channel_map_from_text_headers( const std::vector& columnHeaders, const frantic::tstring& file, bool noError, frantic::channels::data_type_t positionTypeHint, frantic::channels::channel_column_map& outGuess ) { typedef std::map> column_map; column_map columnMappings; vector columnNames; std::vector headerTokens; for( unsigned i = 0; i < columnHeaders.size(); ++i ) { headerTokens.clear(); frantic::strings::split( columnHeaders[i], headerTokens ); // I think its reasonable to say that an empty column can be treated like a gap if( headerTokens.empty() ) { continue; } if( !noError && headerTokens.size() > 2 ) throw frantic::invalid_particle_file_exception() << "csv_particle_istream: The input file \"" + frantic::strings::to_string( file ) + "\" has an invalid column header entry, \"" + columnHeaders[i] + "\"."; frantic::tstring channelName; column_mapping mapping; mapping.column = i; mapping.type = frantic::channels::data_type_invalid; mapping.hasIndex = false; if( headerTokens.size() == 1 ) { channelName = frantic::strings::to_tstring( headerTokens[0] ); if( mapping.type == frantic::channels::data_type_invalid ) { // "If the float32/float16 definition is missing, the type will default to float32." mapping.type = frantic::channels::data_type_float32; } } else { mapping.type = frantic::channels::channel_data_type_from_string( headerTokens[0] ); if( mapping.type == frantic::channels::data_type_invalid ) { if( !noError ) { throw frantic::invalid_particle_file_exception( "csv_particle_istream: The input file \"" + frantic::strings::to_string( file ) + "\" has an invalid column header type, \"" + headerTokens[0] + "\". Some examples of valid type names are \"float32\" and \"int16\"." ); } } channelName = frantic::strings::to_tstring( headerTokens[1] ); } size_t startBracketIndex = channelName.find( '[' ); frantic::tstring indexNumberStr; int indexNumber = -1; // The indexing bracket can't be at index 0, there must be a channel name in there too if( startBracketIndex == 0 ) { if( noError ) { continue; } else { throw frantic::invalid_particle_file_exception( "csv_particle_istream: The input file \"" + frantic::strings::to_string( file ) + "\" has an invalid column header name, \"" + columnHeaders[i] + "\". There must be a channel name, not just an index." ); } } // If there are indexing brackets, extract the index number else if( startBracketIndex != frantic::tstring::npos ) { size_t endBracketIndex = channelName.find( ']', startBracketIndex ); if( endBracketIndex != channelName.size() - 1 ) { if( noError ) { if( endBracketIndex < startBracketIndex || endBracketIndex == frantic::tstring::npos ) { endBracketIndex = channelName.size(); } } else { throw frantic::invalid_particle_file_exception( "csv_particle_istream: The input file \"" + frantic::strings::to_string( file ) + "\" has an invalid column header name, \"" + columnHeaders[i] + "\". The index brackets are incorrect, they should be like \"Channel[1]\"." ); } } else { // Extract the index indexNumberStr = channelName.substr( startBracketIndex + 1, endBracketIndex - startBracketIndex - 1 ); // Strip off the indexing brackets from the name channelName.resize( startBracketIndex ); } } // Apply the position data type if provided if( positionTypeHint != channels::data_type_invalid && channelName == _T("Position") ) { mapping.type = positionTypeHint; } if( indexNumberStr.empty() ) { indexNumber = -1; } else { try { indexNumber = boost::lexical_cast( indexNumberStr ); } catch( boost::bad_lexical_cast e ) { if( !noError ) { throw frantic::invalid_particle_file_exception() << "csv_particle_istream: The input file \"" << frantic::strings::to_string( file ) << "\" has an invalid column header name, \"" << columnHeaders[i] << "\". The number inside the index brackets is incorrect, it must be a valid integer.)"; } } if( !noError && indexNumber < 0 ) { throw frantic::invalid_particle_file_exception() << "csv_particle_istream: The input file \"" << frantic::strings::to_string( file ) << "\" has an invalid column header name, \"" << columnHeaders[i] << "\". The number inside the index brackets is incorrect, it must be a nonnegative integer."; } } if( indexNumber >= 0 ) { mapping.hasIndex = true; mapping.index = size_t( indexNumber ); } if( !frantic::channels::is_valid_channel_name( channelName ) ) { if( noError ) { continue; } else { throw frantic::invalid_particle_file_exception() << "csv_particle_istream: The input file \"" << frantic::strings::to_string( file ) << "\" has an invalid column header name, \"" << columnHeaders[i] << "\". Names must have alphanumeric characters and \'_\' only."; } } // Keep track of the column names in order so we can preserve the ordering from the CSV (somewhat) if( columnMappings.find( channelName ) == columnMappings.end() ) columnNames.push_back( channelName ); columnMappings[channelName].push_back( mapping ); } frantic::channels::channel_map channelMap; for( vector::const_iterator it = columnNames.begin(); it != columnNames.end(); ++it ) { const frantic::tstring& channelName = *it; std::vector& channelColumns = columnMappings[channelName]; frantic::channels::data_type_t typeGuess = frantic::channels::data_type_invalid; std::set indexMapping; for( size_t j = 0; j < channelColumns.size(); ++j ) { if( typeGuess == frantic::channels::data_type_invalid ) { typeGuess = channelColumns[j].type; } else if( !noError ) { if( channelColumns[j].type != frantic::channels::data_type_invalid && channelColumns[j].type != typeGuess ) { throw frantic::invalid_particle_file_exception() << "csv_particle_istream: Data type for column \"" << columnHeaders[channelColumns[j].column] << "\" does not match previous column for channel \"" << frantic::strings::to_string( channelName ) << "\"."; } } if( channelColumns[j].hasIndex ) { // Duplicate indices on the same channel is an error if( !noError && indexMapping.find( channelColumns[j].index ) != indexMapping.end() ) { throw frantic::invalid_particle_file_exception() << "csv_particle_istream: Duplicate index " << channelColumns[j].index << " for channel \"" << frantic::strings::to_string( channelName ) << "\" found in column " << j << "."; } indexMapping.insert( channelColumns[j].index ); } else if( !noError && !indexMapping.empty() ) { throw frantic::invalid_particle_file_exception() << "csv_particle_istream: Mixed index specification found for channel \"" << frantic::strings::to_string( channelName ) << "\". Indices must be specified either for all columns of a given channel, or none."; } } if( typeGuess == frantic::channels::data_type_invalid ) { typeGuess = frantic::channels::data_type_float32; } // Assign ordering if not given, or attempt to salvage an ordering in noError mode if( noError || indexMapping.empty() ) { size_t currentIndex = 0; for( size_t j = 0; j < channelColumns.size(); ++j ) { if( !channelColumns[j].hasIndex ) { while( indexMapping.find( currentIndex ) != indexMapping.end() ) { ++currentIndex; } channelColumns[j].index = currentIndex; indexMapping.insert( currentIndex ); ++currentIndex; } } } std::stable_sort( channelColumns.begin(), channelColumns.end() ); // attempt to stably realign duplicates if we don't want to just error out if( noError ) { for( size_t j = 0; j < channelColumns.size(); ++j ) { channelColumns[j].index = j; } } else { bool oneBased = channelColumns[0].index == 1; for( size_t j = 0; j < channelColumns.size(); ++j ) { if( oneBased ) { --channelColumns[j].index; } if( channelColumns[j].index != j ) { throw frantic::invalid_particle_file_exception() << "csv_particle_istream: Index specifications for channel \"" << frantic::strings::to_string( channelName ) << "\" were not a contiguous 1 or 0-based list."; } } } channelMap.define_channel( channelName, channelColumns.size(), typeGuess ); } channelMap.end_channel_definition(); outGuess.reset( channelMap, columnHeaders.size() ); for( std::map>::iterator it = columnMappings.begin(); it != columnMappings.end(); ++it ) { // Since we already sorted each channel's columns by index, we can assign them in order for( size_t j = 0; j < it->second.size(); ++j ) { if( it->second[j].type == frantic::channels::data_type_invalid ) { if( !noError ) { throw std::logic_error( "csv_particle_istream: an invalid data type was detected in column " + boost::lexical_cast( it->second[j].column ) ); } } outGuess.set_column_mapping( it->second[j].column, it->first, it->second[j].index ); } } } } // anonymous namespace csv_particle_istream::csv_format_guess csv_particle_istream::guess_channel_column_map( const std::vector& columnHeaders, const frantic::tstring& file, bool noError, data_type_t positionTypeHint, channel_column_map& outMapping ) { bool anyNumbers = false, allNumbers = true; for( unsigned i = 0; i < columnHeaders.size(); ++i ) { std::string entry = frantic::strings::trim( columnHeaders[i] ); const char* str = entry.c_str(); char* end = 0; frantic::locale::strtod_c( str, &end ); if( end == str + entry.size() ) anyNumbers = true; else allNumbers = false; } csv_format_guess guessType = CSV_PARTICLE_GUESS_UNKNOWN; if( allNumbers ) { guess_native_channel_map_from_numeric_headers( columnHeaders, noError, positionTypeHint, outMapping ); guessType = CSV_PARTICLE_GUESS_NUMBERS; } else if( anyNumbers ) { if( !noError ) { throw frantic::invalid_particle_file_exception() << "csv_particle_istream: The input file \"" + frantic::strings::to_string( file ) + "\" is not a valid particle .csv file. The column header must not contain any numeric values."; } guessType = CSV_PARTICLE_GUESS_MIXED; } else { guess_native_channel_map_from_text_headers( columnHeaders, file, noError, positionTypeHint, outMapping ); guessType = CSV_PARTICLE_GUESS_HEADERS; } return guessType; } void csv_particle_istream::initialize_stream( const frantic::tstring& file, const channel_column_map& expectedFormat, char delimiter, int headerRowCount, data_type_t positionTypeHint, bool noError ) { m_name = file; m_fin.reset( frantic::files::tfopen( file.c_str(), _T("rb") ) ); if( !m_fin ) throw std::runtime_error( "csv_particle_istream: Failed to open file \"" + frantic::strings::to_string( file ) + "\" for reading." ); if( delimiter == '\0' ) { bool foundDelimiter = guess_csv_delimiter( m_fin, file, m_delimiter ); if( !foundDelimiter ) { m_delimiter = ','; FF_LOG( debug ) << "csv_particle_istream: In file \"" << name() << "\": Unable to find delimiter. Using \'" << boost::lexical_cast( m_delimiter ) << "\'." << std::endl; } else { FF_LOG( debug ) << "csv_particle_istream: In file \"" << name() << "\": Using delimiter \'" << boost::lexical_cast( m_delimiter ) << "\'." << std::endl; } std::fseek( m_fin, 0, SEEK_SET ); } else { m_delimiter = delimiter; } // Load in the header std::vector columnHeaders; m_csvReader = get_csv_reader( m_fin, file, false, m_delimiter ); m_csvReader.read_line( columnHeaders ); if( columnHeaders.empty() ) throw frantic::invalid_particle_file_exception() << "csv_particle_istream: The input file \"" + frantic::strings::to_string( file ) + "\" is not a valid particle .csv file. The first line must not be empty."; if( expectedFormat.is_empty() ) { csv_format_guess guessType = guess_channel_column_map( columnHeaders, file, noError, positionTypeHint, m_channelColumnMapping ); if( guessType == CSV_PARTICLE_GUESS_NUMBERS ) { headerRowCount = 0; } else { headerRowCount = 1; } } else { m_channelColumnMapping = expectedFormat; m_channelColumnMapping.remove_trailing_columns(); if( m_channelColumnMapping.column_count() > columnHeaders.size() ) { throw frantic::invalid_particle_file_exception() << "csv_particle_istream: Suggested particle format has more columns than in file \"" + frantic::strings::to_string( file ) + "\". " << "Suggested format has " << boost::lexical_cast( m_channelColumnMapping.column_count() ) + ", " << "but file has only " + boost::lexical_cast( columnHeaders.size() ) + "."; } } if( headerRowCount <= 0 ) { std::clearerr( m_fin ); } m_headerRowCount = headerRowCount; m_fin.close(); m_finReopened = false; } void csv_particle_istream::initialize_stream( const frantic::tstring& file, char delimiter, data_type_t positionTypeHint, bool noError ) { initialize_stream( file, channel_column_map(), delimiter, -1, positionTypeHint, noError ); } } // namespace streams } // namespace particles } // namespace frantic