Usage¶

Row-Oriented Reading¶

The RowReader provides a convenient row-oriented interface for reading Parquet files with typed accessor methods for type-safe field access.

import dev.hardwood.reader.ParquetFileReader;
import dev.hardwood.reader.RowReader;
import dev.hardwood.row.PqStruct;
import dev.hardwood.row.PqList;
import dev.hardwood.row.PqIntList;
import dev.hardwood.row.PqMap;
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalTime;
import java.math.BigDecimal;
import java.util.UUID;

try (ParquetFileReader fileReader = ParquetFileReader.open(path);
    RowReader rowReader = fileReader.createRowReader()) {

    while (rowReader.hasNext()) {
        rowReader.next();

        // Access columns by name with typed accessors
        long id = rowReader.getLong("id");
        String name = rowReader.getString("name");

        // Logical types are automatically converted
        LocalDate birthDate = rowReader.getDate("birth_date");
        Instant createdAt = rowReader.getTimestamp("created_at");
        LocalTime wakeTime = rowReader.getTime("wake_time");
        BigDecimal balance = rowReader.getDecimal("balance");
        UUID accountId = rowReader.getUuid("account_id");

        // Check for null values
        if (!rowReader.isNull("age")) {
            int age = rowReader.getInt("age");
            System.out.println("ID: " + id + ", Name: " + name + ", Age: " + age);
        }

        // Access nested structs
        PqStruct address = rowReader.getStruct("address");
        if (address != null) {
            String city = address.getString("city");
            int zip = address.getInt("zip");
        }

        // Access lists and iterate with typed accessors
        PqList tags = rowReader.getList("tags");
        if (tags != null) {
            for (String tag : tags.strings()) {
                System.out.println("Tag: " + tag);
            }
        }
    }
}

Advanced: nested lists, maps, and list-of-structs

        // Access list of structs
        PqList contacts = rowReader.getList("contacts");
        if (contacts != null) {
            for (PqStruct contact : contacts.structs()) {
                String contactName = contact.getString("name");
                String phone = contact.getString("phone");
            }
        }

        // Access nested lists (list<list<int>>) using primitive int lists
        PqList matrix = rowReader.getList("matrix");
        if (matrix != null) {
            for (PqIntList innerList : matrix.intLists()) {
                for (var it = innerList.iterator(); it.hasNext(); ) {
                    int val = it.nextInt();
                    System.out.println("Value: " + val);
                }
            }
        }

        // Access maps (map<string, int>)
        PqMap attributes = rowReader.getMap("attributes");
        if (attributes != null) {
            for (PqMap.Entry entry : attributes.getEntries()) {
                String key = entry.getStringKey();
                int value = entry.getIntValue();
                System.out.println(key + " = " + value);
            }
        }

        // Access maps with struct values (map<string, struct>)
        PqMap people = rowReader.getMap("people");
        if (people != null) {
            for (PqMap.Entry entry : people.getEntries()) {
                String personId = entry.getStringKey();
                PqStruct person = entry.getStructValue();
                String personName = person.getString("name");
                int personAge = person.getInt("age");
            }
        }

Typed Accessor Methods¶

All accessor methods are available in two forms:

Name-based (e.g., getInt("column_name")) — convenient for ad-hoc access
Index-based (e.g., getInt(columnIndex)) — faster for performance-critical loops

Method	Physical/Logical Type	Java Type
`getBoolean(name)` / `getBoolean(index)`	BOOLEAN	`boolean`
`getInt(name)` / `getInt(index)`	INT32	`int`
`getLong(name)` / `getLong(index)`	INT64	`long`
`getFloat(name)` / `getFloat(index)`	FLOAT	`float`
`getDouble(name)` / `getDouble(index)`	DOUBLE	`double`
`getBinary(name)` / `getBinary(index)`	BYTE_ARRAY	`byte[]`
`getString(name)` / `getString(index)`	STRING logical type	`String`
`getDate(name)` / `getDate(index)`	DATE logical type	`LocalDate`
`getTime(name)` / `getTime(index)`	TIME logical type	`LocalTime`
`getTimestamp(name)` / `getTimestamp(index)`	TIMESTAMP logical type	`Instant`
`getDecimal(name)` / `getDecimal(index)`	DECIMAL logical type	`BigDecimal`
`getUuid(name)` / `getUuid(index)`	UUID logical type	`UUID`
`getStruct(name)`	Nested struct	`PqStruct`
`getList(name)`	LIST logical type	`PqList`
`getMap(name)`	MAP logical type	`PqMap`
`isNull(name)` / `isNull(index)`	Any	`boolean`

Index-based access example:

// Get column indices once (before the loop)
int idIndex = fileReader.getFileSchema().getColumn("id").columnIndex();
int nameIndex = fileReader.getFileSchema().getColumn("name").columnIndex();

while (rowReader.hasNext()) {
    rowReader.next();
    if (!rowReader.isNull(idIndex)) {
        long id = rowReader.getLong(idIndex);      // No name lookup per row
        String name = rowReader.getString(nameIndex);
    }
}

Type validation: The API validates at runtime that the requested type matches the schema. Mismatches throw IllegalArgumentException with a descriptive message.

Column Projection¶

Column projection allows reading only a subset of columns from a Parquet file, improving performance by skipping I/O, decoding, and memory allocation for unneeded columns.

import dev.hardwood.schema.ColumnProjection;
import dev.hardwood.reader.ParquetFileReader;
import dev.hardwood.reader.RowReader;

try (ParquetFileReader fileReader = ParquetFileReader.open(path);
     RowReader rowReader = fileReader.createRowReader(
         ColumnProjection.columns("id", "name", "created_at"))) {

    while (rowReader.hasNext()) {
        rowReader.next();

        // Access projected columns normally
        long id = rowReader.getLong("id");
        String name = rowReader.getString("name");
        Instant createdAt = rowReader.getTimestamp("created_at");

        // Accessing non-projected columns throws IllegalArgumentException
        // rowReader.getInt("age");  // throws "Column not in projection: age"
    }
}

Projection options:

ColumnProjection.all() — read all columns (default)
ColumnProjection.columns("id", "name") — read specific columns by name
ColumnProjection.columns("address") — select an entire struct and all its children
ColumnProjection.columns("address.city") — select a specific nested field (dot notation)

Reading Multiple Files¶

When processing multiple Parquet files, use the Hardwood class to share a thread pool across readers.

Unified Multi-File Reader¶

For reading multiple files as a single logical dataset, use openAll() which returns a MultiFileParquetReader. From there, create a row reader or column readers:

import dev.hardwood.Hardwood;
import dev.hardwood.reader.MultiFileParquetReader;
import dev.hardwood.reader.MultiFileRowReader;

List<Path> files = List.of(
    Path.of("data_2024_01.parquet"),
    Path.of("data_2024_02.parquet"),
    Path.of("data_2024_03.parquet")
);

try (Hardwood hardwood = Hardwood.create();
     MultiFileParquetReader parquet = hardwood.openAll(files);
     MultiFileRowReader reader = parquet.createRowReader()) {

    while (reader.hasNext()) {
        reader.next();
        // Access data using the same API as single-file RowReader
        long id = reader.getLong("id");
        String name = reader.getString("name");
    }
}

The MultiFileRowReader provides cross-file prefetching: when pages from file N are running low, pages from file N+1 are already being prefetched. This eliminates I/O stalls at file boundaries.

With column projection:

try (Hardwood hardwood = Hardwood.create();
     MultiFileParquetReader parquet = hardwood.openAll(files);
     MultiFileRowReader reader = parquet.createRowReader(
         ColumnProjection.columns("id", "name", "amount"))) {

    while (reader.hasNext()) {
        reader.next();
        // Only projected columns are read
    }
}

Column-Oriented Reading (ColumnReader)¶

The ColumnReader provides batch-oriented columnar access with typed primitive arrays, avoiding per-row method calls and boxing. This is the fastest way to consume Parquet data when you process columns independently.

Single-File Column Reading¶

import dev.hardwood.reader.ParquetFileReader;
import dev.hardwood.reader.ColumnReader;

try (ParquetFileReader reader = ParquetFileReader.open(path)) {
    // Create a column reader by name (spans all row groups automatically)
    try (ColumnReader fare = reader.createColumnReader("fare_amount")) {
        double sum = 0;
        while (fare.nextBatch()) {
            int count = fare.getRecordCount();
            double[] values = fare.getDoubles();
            BitSet nulls = fare.getElementNulls(); // null if column is required

            for (int i = 0; i < count; i++) {
                if (nulls == null || !nulls.get(i)) {
                    sum += values[i];
                }
            }
        }
    }
}

Typed accessors are available for each physical type: getInts(), getLongs(), getFloats(), getDoubles(), getBooleans(), getBinaries(), and getStrings(). Column readers can also be created by index via createColumnReader(int columnIndex).

Multi-File Column Reading¶

For reading columns across multiple files with cross-file prefetching, use MultiFileColumnReaders:

import dev.hardwood.Hardwood;
import dev.hardwood.reader.MultiFileParquetReader;
import dev.hardwood.reader.MultiFileColumnReaders;
import dev.hardwood.reader.ColumnReader;
import dev.hardwood.schema.ColumnProjection;

try (Hardwood hardwood = Hardwood.create();
     MultiFileParquetReader parquet = hardwood.openAll(files);
     MultiFileColumnReaders columns = parquet.createColumnReaders(
         ColumnProjection.columns("passenger_count", "trip_distance", "fare_amount"))) {

    ColumnReader col0 = columns.getColumnReader("passenger_count");
    ColumnReader col1 = columns.getColumnReader("trip_distance");
    ColumnReader col2 = columns.getColumnReader("fare_amount");

    long passengerCount = 0;
    double tripDistance = 0, fareAmount = 0;

    while (col0.nextBatch() & col1.nextBatch() & col2.nextBatch()) {
        int count = col0.getRecordCount();
        double[] v0 = col0.getDoubles();
        double[] v1 = col1.getDoubles();
        double[] v2 = col2.getDoubles();

        for (int i = 0; i < count; i++) {
            passengerCount += (long) v0[i];
            tripDistance += v1[i];
            fareAmount += v2[i];
        }
    }
}

Nested and Repeated Columns¶

For nested columns (lists, maps), ColumnReader provides multi-level offsets and per-level null bitmaps:

try (ColumnReader reader = fileReader.createColumnReader("tags")) {
    while (reader.nextBatch()) {
        int recordCount = reader.getRecordCount();
        int valueCount = reader.getValueCount();
        byte[][] values = reader.getBinaries();
        int[] offsets = reader.getOffsets(0);            // record -> value position
        BitSet recordNulls = reader.getLevelNulls(0);    // null list records
        BitSet elementNulls = reader.getElementNulls();  // null elements within lists

        for (int r = 0; r < recordCount; r++) {
            if (recordNulls != null && recordNulls.get(r)) continue;
            int start = offsets[r];
            int end = (r + 1 < recordCount) ? offsets[r + 1] : valueCount;
            for (int i = start; i < end; i++) {
                if (elementNulls == null || !elementNulls.get(i)) {
                    // process values[i]
                }
            }
        }
    }
}

getNestingDepth() returns 0 for flat columns, or the number of offset levels for nested columns.

Accessing File Metadata¶

File metadata can be inspected without reading any row data:

try (ParquetFileReader reader = ParquetFileReader.open(path)) {
    FileMetaData metadata = reader.getFileMetaData();

    System.out.println("Version: " + metadata.version());
    System.out.println("Total rows: " + metadata.numRows());
    System.out.println("Row groups: " + metadata.rowGroups().size());

    FileSchema schema = reader.getFileSchema();
    for (int i = 0; i < schema.getColumnCount(); i++) {
        ColumnSchema column = schema.getColumn(i);
        System.out.print("Column " + i + ": " + column.name() +
                         " (" + column.type() + ", " + column.repetitionType());

        // Display logical type if present
        if (column.logicalType() != null) {
            System.out.print(", " + column.logicalType());
        }
        System.out.println(")");
    }
}