diff --git a/src/components/fundable/descriptions/ParquetNullOptimizations.md b/src/components/fundable/descriptions/ParquetNullOptimizations.md new file mode 100644 index 00000000..fb2aed2d --- /dev/null +++ b/src/components/fundable/descriptions/ParquetNullOptimizations.md @@ -0,0 +1,37 @@ +#### Overview + +Apache Parquet is an open source, column-oriented data file format designed for +efficient data storage and retrieval. Together with Apache Arrow for in-memory data, +it has become the *de facto* standard for efficient columnar analytics. + +While Parquet and Arrow are most often used together, they have incompatible physical +representations of data with optional values: data where some values can be +missing or "null". While Arrow uses a validity bitmap for each schema field and nesting level, +Parquet condenses that information into a more sophisticated structure called definition +levels (borrowing ideas from Google's Dremel project). + +Converting between those two representations is non-trivial and often turns out +to be a performance bottleneck when reading a Parquet file as in-memory Arrow data. +Even columns that in practice do not contain any nulls can still suffer from it if +the data is declared nullable (optional) at the schema level. + +We propose to optimize the conversion of null values from Parquet in Arrow C++ +for flat (non-nested) data: + +1. decoding Parquet definition levels directly into an Arrow validity bitmap, rather than using an + intermediate representation as 16-bit integers; + +2. avoiding decoding definition levels entirely when a data page's statistics show + it cannot contain any nulls (or, conversely, when it cannot contain any non-null values). + +As a subsequent task, these optimizations may be extended so as to apply to schemas +with moderate amounts of nesting. + +This work will benefit applications using Arrow C++ or any of its language +bindings (such as PyArrow, R-Arrow...). 
+ +Depending on the characteristics of the Parquet data, this could make Parquet reading 2x +faster, or even more in some cases. If you are unsure whether your workload could +benefit, we can discuss this based on sample Parquet files you provide us. + +##### Are you interested in this project? Either entirely or partially, contact us for more information on how to help us fund it diff --git a/src/components/fundable/projectsDetails.ts b/src/components/fundable/projectsDetails.ts index a28e1efd..f3d2a4fd 100644 --- a/src/components/fundable/projectsDetails.ts +++ b/src/components/fundable/projectsDetails.ts @@ -5,8 +5,9 @@ import EmscriptenForgePackageRequestsMD from "@site/src/components/fundable/desc import SVE2SupportInXsimdMD from "@site/src/components/fundable/descriptions/SVE2SupportInXsimd.md" import MatrixOperationsInXtensorMD from "@site/src/components/fundable/descriptions/MatrixOperationsInXtensor.md" import BinaryViewInArrowCppMD from "@site/src/components/fundable/descriptions/BinaryViewInArrowCpp.md" -import Decimal32InArrowCppMD from"@site/src/components/fundable/descriptions/Decimal32InArrowCpp.md" -import Float16InArrowCppMD from"@site/src/components/fundable/descriptions/Float16InArrowCpp.md" +import Decimal32InArrowCppMD from "@site/src/components/fundable/descriptions/Decimal32InArrowCpp.md" +import Float16InArrowCppMD from "@site/src/components/fundable/descriptions/Float16InArrowCpp.md" +import ParquetNullOptimizationsMD from "@site/src/components/fundable/descriptions/ParquetNullOptimizations.md" export const fundableProjectsDetails = { jupyterEcosystem: [ @@ -125,6 +126,18 @@ export const fundableProjectsDetails = { currentNbOfFunders: 0, currentFundingPercentage: 0, repoLink: "https://github.com/apache/arrow" + }, + { + category: "Apache Arrow and Parquet", + title: "Parquet reader optimizations", + pageName: "ParquetNullOptimizations", + shortDescription: "Converting Parquet optional values to nullable Arrow data is often a performance bottleneck. 
We will optimize that step for the most common cases.", + description: ParquetNullOptimizationsMD, + price: "TBD", + maxNbOfFunders: 1, + currentNbOfFunders: 0, + currentFundingPercentage: 0, + repoLink: "https://github.com/apache/arrow" } ] diff --git a/src/pages/fundable/ParquetNullOptimizations/GetAQuote.tsx b/src/pages/fundable/ParquetNullOptimizations/GetAQuote.tsx new file mode 100644 index 00000000..a945b598 --- /dev/null +++ b/src/pages/fundable/ParquetNullOptimizations/GetAQuote.tsx @@ -0,0 +1,9 @@ +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; +import GetAQuotePage from '@site/src/components/fundable/GetAQuotePage'; + +export default function FundablePage() { + const { siteConfig } = useDocusaurusContext(); + return ( + + ); +} \ No newline at end of file diff --git a/src/pages/fundable/ParquetNullOptimizations/index.tsx b/src/pages/fundable/ParquetNullOptimizations/index.tsx new file mode 100644 index 00000000..876857af --- /dev/null +++ b/src/pages/fundable/ParquetNullOptimizations/index.tsx @@ -0,0 +1,9 @@ +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; +import LargeProjectCardPage from '@site/src/components/fundable/LargeProjectCardPage'; + +export default function FundablePage() { + const { siteConfig } = useDocusaurusContext(); + return ( + + ); +}