U Dx`@sZddlZddlmZddlmZddlmZddlZddlZddl Z ddl Z ddl Z ddl ZddlmZmZmZiaddZdd Ze jd e jd e jd e jd e jde jde jde jde jde jde j ddde j!de j"diZ#ddZ$ddZ%ddZ&dd Z'd!d"Z(d#d$Z)d%d&Z*d'd(Z+d)d*Z,d+d,Z-d-d.Z.d/d0Z/d1d2Z0d3d4Z1dld5d6Z2dmd9d:Z3d;d<Z4d=d>Z5d?d@Z6dndAdBZ7dCdDZ8dodFdGZ9dHdIe j:dJe j:dKe j:dLdMd gDZ;dpdNdOZdTdUZ?dVdWZ@dXdYZAdZd[ZBdd\e j!e j"e jCe je j e jDd]ZEd^d_ZFd`daZGdbdcZHdddeZIdfdgZJdhdiZKdjdkZLdS)qN)Sequence)deepcopy) zip_longest) _pandas_apibuiltin_pickle frombytesc*Cststtjjdtjjdtjjdtjjdtjjdtjj dtjj dtjj dtjj d tjj d tjjd tjjd tjjd tjjdtjjdtjjdtjjdtjjdtjjdtjjditS)Nemptyboolint8int16int32int64uint8uint16uint32uint64float16float32float64datetimebytesunicode)_logical_type_mapupdatepalibZType_NAZ Type_BOOLZ Type_INT8Z Type_INT16Z Type_INT32Z Type_INT64Z Type_UINT8Z Type_UINT16Z Type_UINT32Z Type_UINT64ZType_HALF_FLOATZ Type_FLOATZ Type_DOUBLEZ Type_DATE32Z Type_DATE64Z Type_TIME32Z Type_TIME64Z Type_BINARYZType_FIXED_SIZE_BINARYZ Type_STRINGrrs z&construct_metadata..N)rFr+rGFlevelsnamesspandaspyarrow)Zlibraryversion) index_columnscolumn_indexescolumnsZcreatorZpandas_versionutf8)r;ziprNappendr&rOrFr9rY_get_simple_index_descriptorjsondumpsr __version__rrVencode)columns_to_convertdf column_names index_levelsindex_descriptorspreserve_indextypesZnum_serialized_index_levelsZntypesZdf_typesZ index_typesZcolumn_metadatacolZsanitized_namer+r@Zindex_column_metadatalevelZ descriptorrXrSrTrFrrrconstruct_metadatas\       rkcCsTt|\}}t|}d|kr,tjdtdd|dkrD|rtttt|St|trRtdn |dkr^dSt|S)aConvert a column name (or level) to either a string or a recursive collection of strings. Parameters ---------- name : str or tuple Returns ------- value : str or tuple Examples -------- >>> name = 'foo' >>> _column_name_to_strings(name) 'foo' >>> name = ('foo', 'bar') >>> _column_name_to_strings(name) ('foo', 'bar') >>> import pandas as pd >>> name = (1, pd.Timestamp('2017-02-01 00:00:00')) >>> _column_name_to_strings(name) ('1', '2017-02-01 00:00:00') rZz%Unsupported type for MultiIndex levelN) r&r2rdecodetuplemap_column_name_to_stringsrrJrFrrrrws      rwcCs(|jdk r|j|kr|jSd|SdS)zReturn the name of an index level or a default name if `index.name` is None or is already a column name. Parameters ---------- index : pandas.Index i : int Returns ------- name : str Nz__index_level_{:d}__)rFr')indexirdrrr_index_level_name6s r{cCs:t|||}|jjs(tdt|j|dk r.)r;r9r)rynrrrrsrcsP|dk r|dk rtdn2|dk r*|j}n"|dk rFfdd|D}nj}|S)NzJSchema and columns arguments are mutually exclusive, pass only one of themcsg|]}|jkr|qSr)rYrPcrcrrrRs z0_resolve_columns_of_interest..)rrTrY)rcrrYrrrr}s r}cCst|d||\}}}}}}}g} |D]} | j} t| rJtj| ddj} nZt| rltj| dddj} n8t | | j d\} } tj | | } | dkrtj| ddj} | | q$t||||||| } || | fS)NT) from_pandasr)rvaluesrZis_categoricalrarrayr0Zis_extension_array_dtypeheadget_datetimetz_typer/rZ_ndarray_to_arrow_typer\rk)rcrgrYrrd_rfrWrbrhrrtype_r@rrrdataframe_to_typess<    rTc st||||\}}}} } } } |dkrZt|t|j} }| |dkrV|dkrVt}nd}fdddd}|dkrfddt| | D}nd d lm}g}||J}t| | D]8\}}||j r| ||q| | ||qW5QRXt |D]$\}}t ||jr|||<qd d|D}|dkrg}t||D].\}}|dk rd|nd }| t||qNt|}t| ||| | ||}|jrt|jnt}||||}||fS) Ndrc s|dkrd}d}n |j}|j}ztj||dd}WnNtjtjtjfk r}z$|jd|j |j f7_|W5d}~XYnX|s|j dkrt dt ||j |S)NT)r0rsafez0Conversion failed for column {!s} with type {!s}rz>Field {} was non-nullable but pandas column had {} null values)Znullabler0rrZ ArrowInvalidZArrowNotImplementedErrorZArrowTypeErrorargsr'rFr/Z null_countrr2)rirZfield_nullablerr3e)rrrconvert_column2s,  z+dataframe_to_arrays..convert_columncSs$t|tjo"|jjo"t|jjtjSr) r&npZndarrayflags contiguous issubclassr/r0integer)arrrrr_can_definitely_zero_copyHs  z6dataframe_to_arrays.._can_definitely_zero_copycsg|]\}}||qSrr)rPrf)rrrrRNsz'dataframe_to_arrays..r)futurescSsg|] }|jqSrr0rPxrrrrR_srD)rr;rYr cpu_countr[Z concurrentrZThreadPoolExecutorrr\Zsubmitrr&ZFuturer3rrrkr@rrOr with_metadata)rcrrgZnthreadsrYrrrdrrfrWrbrZnrowsZncolsrZarraysrexecutorrrrzZ maybe_futrhfieldsrFrpandas_metadatar@r)rrrdataframe_to_arrayssf           rcCs^|jjtjkr||fSt|rB|dkrB|j}|j}t ||}n|dkrVt |j}||fSr) r/r0rZ datetime64rZ is_datetimetzr*unitr timestampZfrom_numpy_dtype)rr/rr*rrrrrss rcCsddlmm}|j}g}dd|jD}|jD]}|j}i}t||jrpt j |j |d<t |dr|j}n$t||jr|j|j|jd|j}|j|jj|dt||jkrd|d<tj|tjd |d <||q2||d S) NrcSsg|]}|qSrr)rPZaxrrrrRsz0dataframe_to_serialized_dict..r8r) dictionaryr7) placementblockr#)protocolr)blocksaxes)pandas.core.internalscore internals_datarrrr&DatetimeTZBlockrrr>r*r1CategoricalBlockrr<r7r=Zmgr_locsZas_arrayr0 ObjectBlockrr_HIGHEST_PROTOCOLr\)frame_intZ block_managerrrrrZ block_datarrrdataframe_to_serialized_dicts<      rcCs>ddlmm}dd|dD}|||d}t|S)NrcSsg|] }t|qSr_reconstruct_block)rPrrrrrRsz0serialized_dict_to_dataframe..rr)rrr BlockManagerrZ data_frame)datarZreconstructed_blocksZ block_mgrrrrserialized_dict_to_dataframes rc Cs ddlmm}|dd}|d}d|kr\tjj||d|dd}|j|||jd}nd |krt |d }|j|||j |d }nd |kr|jt |||j d}ntd |kr|d } t|d kst||d} || } t| dstd| | } |j| ||jd}n|j||d}|S)a Construct a pandas Block from the `item` dictionary coming from pyarrow's serialization or returned by arrow::python::ConvertTableToPandas. This function takes care of converting dictionary types to pandas categorical, Timestamp-with-timezones to the proper pandas Block, and conversion to pandas ExtensionBlock Parameters ---------- item : dict For basic types, this is a dictionary in the form of {'block': np.ndarray of values, 'placement': pandas block placement}. Additional keys are present for other types (dictionary, timezone, object). columns : Column names of the table being constructed, used for extension types extension_columns : dict Dictionary of {column_name: pandas_dtype} that includes all columns and corresponding dtypes that will be converted to a pandas ExtensionBlock. Returns ------- pandas Block rNrrrr7)r<r7)rklassr8)rrr/r#Zpy_arrayr__from_arrow__zGThis column does not support to be converted to a pandas ExtensionArray)r)rrrgetrZcategorical_typeZ from_codesZ make_blockrmake_datetimetzrrloadsrr;r:r1rrZExtensionBlock) itemrYextension_columnsrZ block_arrrr6rr/rrF pandas_dtypeZ pd_ext_arrrrrrsH       rcCstj|}tjd|dS)Nnsr*)rrstring_to_tzinforZdatetimetz_typerrrrrs rFcCsddlm}g}g}|jj}|sl|dk rl|d}|dg}|d} t||}t|| |\}} t|||} ntj |j } t|g|} t |t |||} t|||| } | | g}|| |S)Nr)rrYrXrW)rrrrr_add_any_metadata_reconstruct_index_get_extension_dtypesrrrnum_rows'_check_data_column_metadata_consistency_deserialize_column_index_table_to_blocks)optionstabler<Zignore_metadata types_mapperr all_columnsrXrrfryZext_columns_dtypesrYrrrrrtable_to_blockmanagers0      rcCsh|]}tt|qSr)r2rr/)rPtyprrr srrZuintfloatr#c Csi}tjdkr|S|D]D}|d}|d}|tkrt|}t|tjrt|dr|||<q|jD]B}|j} t| tj rbz | }Wnt k rYqbX|||j <qb|r|jD]$}|j} || }|dk r|||j <q|S)a Based on the stored column pandas metadata and the extension types in the arrow schema, infer which columns should be converted to a pandas extension dtype. The 'numpy_type' field in the column metadata stores the string representation of the original pandas dtype (and, despite its name, not the 'pandas_type' field). Based on this string representation, a pandas/numpy dtype is constructed and then we can check if this dtype supports conversion from arrow. NrFrIr) rZextension_dtype_pandas_supported_numpy_typesrr&r1rr0rZBaseExtensionTypeZto_pandas_dtypeNotImplementedErrorrF) rZcolumns_metadatarZ ext_columnscol_metarFr/rrrrrrr's4            rcCstdd|DstdS)Ncss.|]&}|ddkrd|kp$|ddk VqdS)rFNrGrrrrr asz:_check_data_column_metadata_consistency..)allr:)rrrrr\srcsdd|jD}|r6dd|Dfdd|D}n|}t|dkrLtjndd}|sftj|}n*tjjjt t ||d d|Dpdd }t|d krt ||}t |}|S) NcSs"g|]}t|trt|n|qSr)r&rrrrrrrRhsz-_deserialize_column_index..cSs&i|]}|dt|d|dqSrGrF)rrwrrrr ksz-_deserialize_column_index..csg|]}||qSrr)rPrFZcolumns_name_dictrrrRosrcSs|fSrr)rrrrxz+_deserialize_column_index..cSsg|] }|dqSrxr)rP col_indexrrrrRsrTr) rdr;ast literal_evalrrIndex MultiIndex from_tuplesrrv"_reconstruct_columns_from_metadata _flatten_single_level_multiindex) block_tablerrXZcolumn_stringsZcolumns_valuesZto_pairrYrrrrgs,    rc Cs,dd|D}g}g}|}|D]}t|trLt||||\}}} |dkrqnZ|ddkr|d} tjj|d|d|d| d }t|t|krqntd |d| || | qtj} t|d kr| j j ||d } nDt|d kr|d } t| | j s$| j | |d d} n | |j } || fS)NcSsi|]}|d|d|qSrrrrrrrsz&_reconstruct_index..rrrFrrr)rrFzUnrecognized index kind: {}rrrrx)r&r2_extract_index_levelrrrr;rr'r\r from_arraysrr) rrfrfield_name_to_metadataZ index_arraysZ index_names result_tablerQr index_namerryrrrrsN        rc Cs||d}t||}|j|}|dkr4|ddfStj}||}|j} t| drh| j j sh| } t |j tjjr|j jdk rt|| |j j} n|j| | jd} ||j|}|| |fS)NrFr)r/) _backwards_compatible_index_namerget_field_indexrrr? to_pandasrr1rZ writeablecopyr&r0rrr)r* make_tz_awareZSeriesr/Z remove_column) rr rGr  logical_namer rzrrirrrrrr s"       r cCs||krt|rdS|SdS)a1Compute the name of an index column that is compatible with older versions of :mod:`pyarrow`. Parameters ---------- raw_name : str logical_name : str Returns ------- result : str Notes ----- * Part of :func:`~pyarrow.pandas_compat.table_to_blockmanager` N)r)raw_namerrrrrsrcCsd}t||dk S)Nz^__index_level_\d+__$)rematch)rFpatternrrrrsrr-)rr!rrr,rZfloatingrcCs@z t|WStk r:d|kr,tjYSt|YSXdS)a Get the numpy dtype that corresponds to a pandas type. Parameters ---------- pandas_type : str The result of a call to pandas.lib.infer_dtype. Returns ------- dtype : np.dtype The dtype that corresponds to `pandas_type`. rlN)_pandas_logical_type_mapr%robject_r/)rHrrr_pandas_type_to_numpy_type s   rcCs,t|tjjr$t|dr|jS|jSdSdS)Nr=)r&rrrr1r=labels)mirrr_get_multiindex_codes"src stjt|ddp|g}t|p0fdd|D}ddt||idD}g}tdd}|D]Z\}}} t|} | tj kr| |}n|j | kr| | }|j | kr| | }| |q^j|||jd S) a_Construct a pandas MultiIndex from `columns` and column index metadata in `column_indexes`. Parameters ---------- columns : List[pd.Index] The columns coming from a pyarrow.Table column_indexes : List[Dict[str, str]] The column index metadata deserialized from the JSON schema metadata in a :class:`~pyarrow.Table`. Returns ------- result : MultiIndex The index reconstructed using `column_indexes` metadata with levels of the correct type. Notes ----- * Part of :func:`~pyarrow.pandas_compat.table_to_blockmanager` rSNcsg|]}t|qSr)rr;)rPrjrrrrRDsz6_reconstruct_columns_from_metadata..cSs0g|](\}}||dt|j|ddfqS)rHrIN)rr2r/)rPrjrrrrrRIs ) fillvaluerarpr)rrr9rroperator methodcallerrrbytes_rvr/Zastyper\rrT) rYrXrSrZ levels_dtypesZ new_levelsencoderrjrZ numpy_dtyper/rrrr*s0        rcs4|jtj|||t}fdd|DS)Ncsg|]}t|qSrr)rPrrYrrrrRmsz$_table_to_blocks..)rdrrZtable_to_blocksrkeys)rrr<rr3rr%rrfs   rcsjtj}t||jrf|jdkrf|j\t|\}j}|jsBt d|j fdd|D||j ddS|S)NrzFound non-unique column indexcs g|]}|dkr|ndqS)rNr)rPZ_labelrSrrrR~sz4_flatten_single_level_multiindex..r)r/rF) rrr&rZnlevelsrSrr/r~rrrT)ryrrr/rr'rrqs rcCsi}i}|j}|d}dd|D}t|}t|d|}t|dD]\}} | d} | s| d} ||krz|||} | dkrd} || } | dkrH| d d krH|| } t| jtjj sqH| d } | sqH| d }|rH|| jj krH| }tj d |d}tj j||d}t|| j||| <||| <qHt|dkrg}g}tt|jD]L}||kr|||||||n|||||j|qPtjj|t|dS|SdS)NrWcSsg|]}t|tr|qSr)r&r2)rPZidx_colrrrrRs z%_add_any_metadata..rYrGrFrDrrHr r@r8rrrr)r)rr;rrrr&r0rrr)r*rrArrayrrrFrr\ZTabler )rrZmodified_columnsZmodified_fieldsrrWZn_index_levelsZ n_columnsrzrridxrir@Z metadata_tzZ convertedZ tz_aware_typerrYrrrrrs\        rcCs$tj|}|jdj|}|S)zB Make a datetime64 Series timezone-aware for the given tz utc)rrrdtZ tz_localizeZ tz_convert)Zseriesr*rrrrs  r)N)rNT)NN)NFN)N)Mrcollections.abcrrr itertoolsrr^r!rrqnumpyrrUrZ pyarrow.librrrrrr(Zbool_r r r r rrrrrrZunicode_r#r.r4rArNrkr]rwr{rrrrrrr}rrrrrrrrZsctypesrrrrrr rrZstr_rrrrrrrrrrrrrs   -G'@@     V( @   5 (6 < A