DC works by splitting data and distributing it across many machines.
from pyarrow import csv, Table

# Read a CSV file straight into an Arrow Table
pa_table = csv.read_csv('./data/bad.csv')
# or build an Arrow Table from an existing pandas DataFrame
pa_table = Table.from_pandas(df)  # df is an existing pandas DataFrame
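As a quick sanity check, you can inspect what was read; this is a minimal sketch using attributes that every Arrow Table exposes:

# Column names and inferred types
print(pa_table.schema)
# Row and column counts
print(pa_table.num_rows, pa_table.num_columns)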
from pyarrow import parquet

# By default, PyArrow writes Parquet files with snappy compression
parquet.write_table(pa_table, 'efficient.parquet')
# To squeeze the data further, use gzip compression
parquet.write_table(pa_table, 'more_efficient.gzip.parquet',
                    compression='gzip')
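To see the difference the codec makes, compare the on-disk sizes; this is a small illustrative check, assuming both files from the snippet above have been written:

import os

snappy_size = os.path.getsize('efficient.parquet')
gzip_size = os.path.getsize('more_efficient.gzip.parquet')
print(f'snappy: {snappy_size} bytes, gzip: {gzip_size} bytes')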
from pyarrow import parquet

# Reading a Parquet file is multi-threaded by default
pa_table = parquet.read_table('efficient.parquet')
# Convert back to a pandas DataFrame
df = pa_table.to_pandas()
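If you need a single-threaded read (for profiling or debugging, say), the threading can be switched off; a minimal sketch of the use_threads flag on read_table:

# Disable the default multi-threaded read
pa_table = parquet.read_table('efficient.parquet', use_threads=False)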
Parquet is a columnar format, so only the columns you select are read. That means less data is accessed, downloaded, and parsed.
from pyarrow import parquet

# Read only the columns we need
pa_table_ids = parquet.read_table('efficient.parquet',
                                  columns=['id', 'last_name'])
df = pa_table_ids.to_pandas()
import pyarrow.parquet as pq

# Write a partitioned dataset: one directory per value of each partition column
# ('one' and 'two' must be columns of the table)
pq.write_to_dataset(pa_table, root_path='dataset_name',
                    partition_cols=['one', 'two'])
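A partitioned dataset can be read back through the same read_table call; as a sketch (reusing the placeholder partition columns 'one' and 'two' from above, with 'a' as a hypothetical value), a filter can be pushed down so only matching partitions are scanned:

import pyarrow.parquet as pq

# Read only the partitions where the 'one' column equals 'a'
filtered = pq.read_table('dataset_name', filters=[('one', '=', 'a')])
df = filtered.to_pandas()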
This content was heavily inspired by the excellent documentation of the PyArrow project.