Usage¶
Row-Oriented Reading¶
The RowReader provides a convenient row-oriented interface for reading Parquet files with typed accessor methods for type-safe field access.
import dev.hardwood.reader.ParquetFileReader;
import dev.hardwood.reader.RowReader;
import dev.hardwood.row.PqStruct;
import dev.hardwood.row.PqList;
import dev.hardwood.row.PqIntList;
import dev.hardwood.row.PqMap;
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalTime;
import java.math.BigDecimal;
import java.util.UUID;
try (ParquetFileReader fileReader = ParquetFileReader.open(path);
RowReader rowReader = fileReader.createRowReader()) {
while (rowReader.hasNext()) {
rowReader.next();
// Access columns by name with typed accessors
long id = rowReader.getLong("id");
String name = rowReader.getString("name");
// Logical types are automatically converted
LocalDate birthDate = rowReader.getDate("birth_date");
Instant createdAt = rowReader.getTimestamp("created_at");
LocalTime wakeTime = rowReader.getTime("wake_time");
BigDecimal balance = rowReader.getDecimal("balance");
UUID accountId = rowReader.getUuid("account_id");
// Check for null values
if (!rowReader.isNull("age")) {
int age = rowReader.getInt("age");
System.out.println("ID: " + id + ", Name: " + name + ", Age: " + age);
}
// Access nested structs
PqStruct address = rowReader.getStruct("address");
if (address != null) {
String city = address.getString("city");
int zip = address.getInt("zip");
}
// Access lists and iterate with typed accessors
PqList tags = rowReader.getList("tags");
if (tags != null) {
for (String tag : tags.strings()) {
System.out.println("Tag: " + tag);
}
}
}
}
Advanced: nested lists, maps, and list-of-structs
// Access list of structs
PqList contacts = rowReader.getList("contacts");
if (contacts != null) {
for (PqStruct contact : contacts.structs()) {
String contactName = contact.getString("name");
String phone = contact.getString("phone");
}
}
// Access nested lists (list<list<int>>) using primitive int lists
PqList matrix = rowReader.getList("matrix");
if (matrix != null) {
for (PqIntList innerList : matrix.intLists()) {
for (var it = innerList.iterator(); it.hasNext(); ) {
int val = it.nextInt();
System.out.println("Value: " + val);
}
}
}
// Access maps (map<string, int>)
PqMap attributes = rowReader.getMap("attributes");
if (attributes != null) {
for (PqMap.Entry entry : attributes.getEntries()) {
String key = entry.getStringKey();
int value = entry.getIntValue();
System.out.println(key + " = " + value);
}
}
// Access maps with struct values (map<string, struct>)
PqMap people = rowReader.getMap("people");
if (people != null) {
for (PqMap.Entry entry : people.getEntries()) {
String personId = entry.getStringKey();
PqStruct person = entry.getStructValue();
String personName = person.getString("name");
int personAge = person.getInt("age");
}
}
Typed Accessor Methods¶
All accessor methods are available in two forms:
- Name-based (e.g.,
getInt("column_name")) — convenient for ad-hoc access - Index-based (e.g.,
getInt(columnIndex)) — faster for performance-critical loops
| Method | Physical/Logical Type | Java Type |
|---|---|---|
getBoolean(name) / getBoolean(index) |
BOOLEAN | boolean |
getInt(name) / getInt(index) |
INT32 | int |
getLong(name) / getLong(index) |
INT64 | long |
getFloat(name) / getFloat(index) |
FLOAT | float |
getDouble(name) / getDouble(index) |
DOUBLE | double |
getBinary(name) / getBinary(index) |
BYTE_ARRAY | byte[] |
getString(name) / getString(index) |
STRING logical type | String |
getDate(name) / getDate(index) |
DATE logical type | LocalDate |
getTime(name) / getTime(index) |
TIME logical type | LocalTime |
getTimestamp(name) / getTimestamp(index) |
TIMESTAMP logical type | Instant |
getDecimal(name) / getDecimal(index) |
DECIMAL logical type | BigDecimal |
getUuid(name) / getUuid(index) |
UUID logical type | UUID |
getStruct(name) |
Nested struct | PqStruct |
getList(name) |
LIST logical type | PqList |
getMap(name) |
MAP logical type | PqMap |
isNull(name) / isNull(index) |
Any | boolean |
Index-based access example:
// Get column indices once (before the loop)
int idIndex = fileReader.getFileSchema().getColumn("id").columnIndex();
int nameIndex = fileReader.getFileSchema().getColumn("name").columnIndex();
while (rowReader.hasNext()) {
rowReader.next();
if (!rowReader.isNull(idIndex)) {
long id = rowReader.getLong(idIndex); // No name lookup per row
String name = rowReader.getString(nameIndex);
}
}
Type validation: The API validates at runtime that the requested type matches the schema. Mismatches throw IllegalArgumentException with a descriptive message.
Column Projection¶
Column projection allows reading only a subset of columns from a Parquet file, improving performance by skipping I/O, decoding, and memory allocation for unneeded columns.
import dev.hardwood.schema.ColumnProjection;
import dev.hardwood.reader.ParquetFileReader;
import dev.hardwood.reader.RowReader;
try (ParquetFileReader fileReader = ParquetFileReader.open(path);
RowReader rowReader = fileReader.createRowReader(
ColumnProjection.columns("id", "name", "created_at"))) {
while (rowReader.hasNext()) {
rowReader.next();
// Access projected columns normally
long id = rowReader.getLong("id");
String name = rowReader.getString("name");
Instant createdAt = rowReader.getTimestamp("created_at");
// Accessing non-projected columns throws IllegalArgumentException
// rowReader.getInt("age"); // throws "Column not in projection: age"
}
}
Projection options:
ColumnProjection.all()— read all columns (default)ColumnProjection.columns("id", "name")— read specific columns by nameColumnProjection.columns("address")— select an entire struct and all its childrenColumnProjection.columns("address.city")— select a specific nested field (dot notation)
Reading Multiple Files¶
When processing multiple Parquet files, use the Hardwood class to share a thread pool across readers.
Unified Multi-File Reader¶
For reading multiple files as a single logical dataset, use openAll() which returns a MultiFileParquetReader. From there, create a row reader or column readers:
import dev.hardwood.Hardwood;
import dev.hardwood.reader.MultiFileParquetReader;
import dev.hardwood.reader.MultiFileRowReader;
List<Path> files = List.of(
Path.of("data_2024_01.parquet"),
Path.of("data_2024_02.parquet"),
Path.of("data_2024_03.parquet")
);
try (Hardwood hardwood = Hardwood.create();
MultiFileParquetReader parquet = hardwood.openAll(files);
MultiFileRowReader reader = parquet.createRowReader()) {
while (reader.hasNext()) {
reader.next();
// Access data using the same API as single-file RowReader
long id = reader.getLong("id");
String name = reader.getString("name");
}
}
The MultiFileRowReader provides cross-file prefetching: when pages from file N are running low, pages from file N+1 are already being prefetched. This eliminates I/O stalls at file boundaries.
With column projection:
try (Hardwood hardwood = Hardwood.create();
MultiFileParquetReader parquet = hardwood.openAll(files);
MultiFileRowReader reader = parquet.createRowReader(
ColumnProjection.columns("id", "name", "amount"))) {
while (reader.hasNext()) {
reader.next();
// Only projected columns are read
}
}
Column-Oriented Reading (ColumnReader)¶
The ColumnReader provides batch-oriented columnar access with typed primitive arrays, avoiding per-row method calls and boxing. This is the fastest way to consume Parquet data when you process columns independently.
Single-File Column Reading¶
import dev.hardwood.reader.ParquetFileReader;
import dev.hardwood.reader.ColumnReader;
try (ParquetFileReader reader = ParquetFileReader.open(path)) {
// Create a column reader by name (spans all row groups automatically)
try (ColumnReader fare = reader.createColumnReader("fare_amount")) {
double sum = 0;
while (fare.nextBatch()) {
int count = fare.getRecordCount();
double[] values = fare.getDoubles();
BitSet nulls = fare.getElementNulls(); // null if column is required
for (int i = 0; i < count; i++) {
if (nulls == null || !nulls.get(i)) {
sum += values[i];
}
}
}
}
}
Typed accessors are available for each physical type: getInts(), getLongs(), getFloats(), getDoubles(), getBooleans(), getBinaries(), and getStrings(). Column readers can also be created by index via createColumnReader(int columnIndex).
Multi-File Column Reading¶
For reading columns across multiple files with cross-file prefetching, use MultiFileColumnReaders:
import dev.hardwood.Hardwood;
import dev.hardwood.reader.MultiFileParquetReader;
import dev.hardwood.reader.MultiFileColumnReaders;
import dev.hardwood.reader.ColumnReader;
import dev.hardwood.schema.ColumnProjection;
try (Hardwood hardwood = Hardwood.create();
MultiFileParquetReader parquet = hardwood.openAll(files);
MultiFileColumnReaders columns = parquet.createColumnReaders(
ColumnProjection.columns("passenger_count", "trip_distance", "fare_amount"))) {
ColumnReader col0 = columns.getColumnReader("passenger_count");
ColumnReader col1 = columns.getColumnReader("trip_distance");
ColumnReader col2 = columns.getColumnReader("fare_amount");
long passengerCount = 0;
double tripDistance = 0, fareAmount = 0;
while (col0.nextBatch() & col1.nextBatch() & col2.nextBatch()) {
int count = col0.getRecordCount();
double[] v0 = col0.getDoubles();
double[] v1 = col1.getDoubles();
double[] v2 = col2.getDoubles();
for (int i = 0; i < count; i++) {
passengerCount += (long) v0[i];
tripDistance += v1[i];
fareAmount += v2[i];
}
}
}
Nested and Repeated Columns¶
For nested columns (lists, maps), ColumnReader provides multi-level offsets and per-level null bitmaps:
try (ColumnReader reader = fileReader.createColumnReader("tags")) {
while (reader.nextBatch()) {
int recordCount = reader.getRecordCount();
int valueCount = reader.getValueCount();
byte[][] values = reader.getBinaries();
int[] offsets = reader.getOffsets(0); // record -> value position
BitSet recordNulls = reader.getLevelNulls(0); // null list records
BitSet elementNulls = reader.getElementNulls(); // null elements within lists
for (int r = 0; r < recordCount; r++) {
if (recordNulls != null && recordNulls.get(r)) continue;
int start = offsets[r];
int end = (r + 1 < recordCount) ? offsets[r + 1] : valueCount;
for (int i = start; i < end; i++) {
if (elementNulls == null || !elementNulls.get(i)) {
// process values[i]
}
}
}
}
}
getNestingDepth() returns 0 for flat columns, or the number of offset levels for nested columns.
Accessing File Metadata¶
File metadata can be inspected without reading any row data:
try (ParquetFileReader reader = ParquetFileReader.open(path)) {
FileMetaData metadata = reader.getFileMetaData();
System.out.println("Version: " + metadata.version());
System.out.println("Total rows: " + metadata.numRows());
System.out.println("Row groups: " + metadata.rowGroups().size());
FileSchema schema = reader.getFileSchema();
for (int i = 0; i < schema.getColumnCount(); i++) {
ColumnSchema column = schema.getColumn(i);
System.out.print("Column " + i + ": " + column.name() +
" (" + column.type() + ", " + column.repetitionType());
// Display logical type if present
if (column.logicalType() != null) {
System.out.print(", " + column.logicalType());
}
System.out.println(")");
}
}