{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1d65d37-6a53-4473-beae-4c7d8a2ea3fa",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import ray\n",
    "import pyarrow as pa\n",
    "\n",
    "# Drop any existing Ray connection first so this cell can be re-run, then connect.\n",
    "ray.shutdown()\n",
    "ray.init(address=\"ray://localhost:10001\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "390b8312-fbd8-462f-9534-dd48e5173447",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def cast(t: pa.Table) -> pa.Table:\n",
    "    \"\"\"Return `t` with its `review_body` column typed as `large_string`.\n",
    "\n",
    "    Workaround for a large_string issue with PyArrow. A table in which\n",
    "    `review_body` does not resolve to exactly one column is returned unchanged.\n",
    "    \"\"\"\n",
    "    schema = t.schema\n",
    "    field_idx = schema.get_field_index(\"review_body\")\n",
    "    if field_idx < 0:\n",
    "        # get_field_index returns -1 when the name is missing or duplicated;\n",
    "        # never forward a negative index to field()/set(), which could touch\n",
    "        # the wrong column or fail with an unrelated error.\n",
    "        return t\n",
    "    new_field = schema.field(field_idx).with_type(pa.large_string())\n",
    "    return t.cast(schema.set(field_idx, new_field))\n",
    "\n",
    "# NOTE(review): `_block_udf` is an underscore-prefixed (private) argument - confirm the installed Ray version accepts it.\n",
    "ds = ray.data.read_parquet(\"s3://amazon-reviews-pds/parquet/product_category=Digital_Software/\", _block_udf=cast)\n",
    "\n",
    "ds.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "406497ed-8b99-4bea-bde7-cbd32cfe5414",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}