From 766e002f9c43f29556d3bec6d3aab22733ee9ce9 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 21 Apr 2026 18:07:08 +0200 Subject: [PATCH 1/2] Add Parquet optimizations as a fundable project --- .../descriptions/ParquetNullOptimizations.md | 34 +++++++++++++++++++ src/components/fundable/projectsDetails.ts | 17 ++++++++-- .../ParquetNullOptimizations/GetAQuote.tsx | 9 +++++ .../ParquetNullOptimizations/index.tsx | 9 +++++ 4 files changed, 67 insertions(+), 2 deletions(-) create mode 100644 src/components/fundable/descriptions/ParquetNullOptimizations.md create mode 100644 src/pages/fundable/ParquetNullOptimizations/GetAQuote.tsx create mode 100644 src/pages/fundable/ParquetNullOptimizations/index.tsx diff --git a/src/components/fundable/descriptions/ParquetNullOptimizations.md b/src/components/fundable/descriptions/ParquetNullOptimizations.md new file mode 100644 index 00000000..d5494dcb --- /dev/null +++ b/src/components/fundable/descriptions/ParquetNullOptimizations.md @@ -0,0 +1,34 @@ +#### Overview + +Apache Parquet is an open source, column-oriented data file format designed for +efficient data storage and retrieval. Together with Apache Arrow for in-memory data, +it has become for the de facto standard for efficient columnar analytics. + +While Parquet and Arrow are most often used together, they have incompatible physical +representations of data with optional values: data where some values can be +missing or "null". While Arrow uses a validity bitmap for each schema field and nesting level, +Parquet condenses that information in a more sophisticated structure called definition +levels (borrowing ideas from Google's Dremel project). + +Converting between those two representations is non-trivial and often turns out +to be a performance bottleneck when reading a Parquet file as in-memory Arrow data. +Even columns that practically do not contain any nulls can still suffer from it if +the data is declared nullable (optional) at the schema level. 
+ +We propose to optimize the conversion of null values from Parquet in Arrow C++ +for flat (non-nested) data: + +1. decoding Parquet definition levels directly into an Arrow validity bitmap, rather than using an + intermediate representation as 16-bit integers; + +2. avoiding decoding definition levels entirely when a data page's statistics shows + it cannot contain any nulls (or, conversely, when it cannot contain any non-null values). + +This work can optionally be extended so as to apply to schemas with moderate amounts +of nesting. + +Depending on the typology of Parquet data, this could make Parquet reading 2x +faster, even more in some cases. If you are ensure whether your workload could +benefit, we can discuss this based on sample Parquet files you provide us. + +##### Are you interested in this project? Either entirely or partially, contact us for more information on how to help us fund it diff --git a/src/components/fundable/projectsDetails.ts b/src/components/fundable/projectsDetails.ts index a28e1efd..d35a2d85 100644 --- a/src/components/fundable/projectsDetails.ts +++ b/src/components/fundable/projectsDetails.ts @@ -5,8 +5,9 @@ import EmscriptenForgePackageRequestsMD from "@site/src/components/fundable/desc import SVE2SupportInXsimdMD from "@site/src/components/fundable/descriptions/SVE2SupportInXsimd.md" import MatrixOperationsInXtensorMD from "@site/src/components/fundable/descriptions/MatrixOperationsInXtensor.md" import BinaryViewInArrowCppMD from "@site/src/components/fundable/descriptions/BinaryViewInArrowCpp.md" -import Decimal32InArrowCppMD from"@site/src/components/fundable/descriptions/Decimal32InArrowCpp.md" -import Float16InArrowCppMD from"@site/src/components/fundable/descriptions/Float16InArrowCpp.md" +import Decimal32InArrowCppMD from "@site/src/components/fundable/descriptions/Decimal32InArrowCpp.md" +import Float16InArrowCppMD from "@site/src/components/fundable/descriptions/Float16InArrowCpp.md" +import ParquetNullOptimizationsMD from 
"@site/src/components/fundable/descriptions/ParquetNullOptimizations.md" export const fundableProjectsDetails = { jupyterEcosystem: [ @@ -125,6 +126,18 @@ export const fundableProjectsDetails = { currentNbOfFunders: 0, currentFundingPercentage: 0, repoLink: "https://github.com/apache/arrow" + }, + { + category: "Apache Arrow and Parquet", + title: "Parquet C++ reader optimizations", + pageName: "ParquetNullOptimizations", + shortDescription: "Converting Parquet optional values to nullable Arrow data is often a performance bottleneck.", + description: ParquetNullOptimizationsMD, + price: "TBD", + maxNbOfFunders: 1, + currentNbOfFunders: 0, + currentFundingPercentage: 0, + repoLink: "https://github.com/apache/arrow" } ] diff --git a/src/pages/fundable/ParquetNullOptimizations/GetAQuote.tsx b/src/pages/fundable/ParquetNullOptimizations/GetAQuote.tsx new file mode 100644 index 00000000..a945b598 --- /dev/null +++ b/src/pages/fundable/ParquetNullOptimizations/GetAQuote.tsx @@ -0,0 +1,9 @@ +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; +import GetAQuotePage from '@site/src/components/fundable/GetAQuotePage'; + +export default function FundablePage() { + const { siteConfig } = useDocusaurusContext(); + return ( + + ); +} \ No newline at end of file diff --git a/src/pages/fundable/ParquetNullOptimizations/index.tsx b/src/pages/fundable/ParquetNullOptimizations/index.tsx new file mode 100644 index 00000000..876857af --- /dev/null +++ b/src/pages/fundable/ParquetNullOptimizations/index.tsx @@ -0,0 +1,9 @@ +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; +import LargeProjectCardPage from '@site/src/components/fundable/LargeProjectCardPage'; + +export default function FundablePage() { + const { siteConfig } = useDocusaurusContext(); + return ( + + ); +} From f15e76fce61608f7b8644883f350839dbc366bbd Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 22 Apr 2026 10:10:07 +0200 Subject: [PATCH 2/2] Address review comments --- 
.../fundable/descriptions/ParquetNullOptimizations.md | 11 +++++++---- src/components/fundable/projectsDetails.ts | 4 ++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/components/fundable/descriptions/ParquetNullOptimizations.md b/src/components/fundable/descriptions/ParquetNullOptimizations.md index d5494dcb..fb2aed2d 100644 --- a/src/components/fundable/descriptions/ParquetNullOptimizations.md +++ b/src/components/fundable/descriptions/ParquetNullOptimizations.md @@ -2,7 +2,7 @@ Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. Together with Apache Arrow for in-memory data, -it has become for the de facto standard for efficient columnar analytics. +it has become the *de facto* standard for efficient columnar analytics. While Parquet and Arrow are most often used together, they have incompatible physical representations of data with optional values: data where some values can be @@ -24,11 +24,14 @@ for flat (non-nested) data: 2. avoiding decoding definition levels entirely when a data page's statistics shows it cannot contain any nulls (or, conversely, when it cannot contain any non-null values). -This work can optionally be extended so as to apply to schemas with moderate amounts -of nesting. +As a subsequent task, these optimizations may be extended so as to apply to schemas +with moderate amounts of nesting. + +This work will benefit applications using Arrow C++ or any of its language +bindings (such as PyArrow, R-Arrow...). Depending on the typology of Parquet data, this could make Parquet reading 2x -faster, even more in some cases. If you are ensure whether your workload could +faster, even more in some cases. If you are unsure whether your workload could benefit, we can discuss this based on sample Parquet files you provide us. ##### Are you interested in this project? 
Either entirely or partially, contact us for more information on how to help us fund it diff --git a/src/components/fundable/projectsDetails.ts b/src/components/fundable/projectsDetails.ts index d35a2d85..f3d2a4fd 100644 --- a/src/components/fundable/projectsDetails.ts +++ b/src/components/fundable/projectsDetails.ts @@ -129,9 +129,9 @@ export const fundableProjectsDetails = { }, { category: "Apache Arrow and Parquet", - title: "Parquet C++ reader optimizations", + title: "Parquet reader optimizations", pageName: "ParquetNullOptimizations", - shortDescription: "Converting Parquet optional values to nullable Arrow data is often a performance bottleneck.", + shortDescription: "Converting Parquet optional values to nullable Arrow data is often a performance bottleneck. We will optimize that step for the most common cases.", description: ParquetNullOptimizationsMD, price: "TBD", maxNbOfFunders: 1,