rust_data_processing/lib.rs
1//! `rust-data-processing` is a small library for ingesting common file formats into an in-memory
2//! [`types::DataSet`], using a user-provided [`types::Schema`].
3//!
4//! The primary entrypoint is [`ingestion::ingest_from_path`], which can auto-detect the ingestion
5//! format from the file extension (or you can force a format via [`ingestion::IngestionOptions`]).
6//!
7//! ## What you can ingest (Epic 1 / Story 1.1)
8//!
9//! **File formats (auto-detected by extension):**
10//!
11//! - **CSV**: `.csv`
12//! - **JSON**: `.json` (array-of-objects) and `.ndjson` (newline-delimited objects)
13//! - **Parquet**: `.parquet`, `.pq`
14//! - **Excel/workbooks**: `.xlsx`, `.xls`, `.xlsm`, `.xlsb`, `.ods`
15//!
16//! **Schema + value types:**
17//!
18//! Ingestion produces a [`types::DataSet`] whose cells are typed [`types::Value`]s matching a
19//! user-provided [`types::Schema`]. Supported logical types are:
20//!
21//! - [`types::DataType::Int64`]
22//! - [`types::DataType::Float64`]
23//! - [`types::DataType::Bool`]
24//! - [`types::DataType::Utf8`]
25//!
26//! Across formats, empty cells / empty strings / explicit JSON `null` map to [`types::Value::Null`].
27//!
28//! ## Quick examples: ingest data
29//!
30//! ```no_run
31//! use rust_data_processing::ingestion::{ingest_from_path, IngestionOptions};
32//! use rust_data_processing::types::{DataType, Field, Schema};
33//!
34//! # fn main() -> Result<(), rust_data_processing::IngestionError> {
35//! let schema = Schema::new(vec![
36//! Field::new("id", DataType::Int64),
37//! Field::new("name", DataType::Utf8),
38//! ]);
39//! // Auto-detects by extension (.csv/.json/.parquet/.xlsx/...).
40//! let ds = ingest_from_path("data.csv", &schema, &IngestionOptions::default())?;
41//! println!("rows={}", ds.row_count());
42//! # Ok(())
43//! # }
44//! ```
45//!
46//! JSON supports nested field paths using dot notation in the schema (e.g. `user.name`):
47//!
48//! ```no_run
49//! use rust_data_processing::ingestion::{ingest_from_path, IngestionOptions};
50//! use rust_data_processing::types::{DataType, Field, Schema};
51//!
52//! # fn main() -> Result<(), rust_data_processing::IngestionError> {
53//! let schema = Schema::new(vec![
54//! Field::new("id", DataType::Int64),
55//! Field::new("user.name", DataType::Utf8),
56//! ]);
57//! let ds = ingest_from_path("events.ndjson", &schema, &IngestionOptions::default())?;
58//! println!("rows={}", ds.row_count());
59//! # Ok(())
60//! # }
61//! ```
62//!
63//! ## Modules
64//!
65//! - [`ingestion`]: unified ingestion entrypoints and format-specific implementations
66//! - [`types`]: schema + in-memory dataset types
67//! - [`export`]: JSON Lines export + deterministic train/test row indices
68//! - [`privacy`]: UTF-8 change summaries after masking transforms (reports only)
69//! - [`reports`]: deterministic truncation helpers for large JSON/text blobs
70//! - [`processing`]: in-memory dataset transformations (filter/map/reduce, feature-wise stats, arg max/min, top‑k frequency)
71//! - [`execution`]: execution engine for parallel pipelines + throttling + metrics
72//! - `sql`: SQL support (Polars-backed; enabled by default)
73//! - [`transform`]: serde-friendly transformation spec compiled to pipeline wrappers
74//! - [`profiling`]: Polars-backed profiling metrics + sampling modes
75//! - [`validation`]: validation DSL + built-in checks + reporting
76//! - [`outliers`]: outlier detection primitives + explainable outputs
77//! - [`cdc`]: CDC boundary types (Phase 1 spike)
78//! - [`error`]: error types used across ingestion
79//!
80//! ## Processing example (1.2 pipeline)
81//!
82//! ```rust
83//! use rust_data_processing::processing::{filter, map, reduce, ReduceOp};
84//! use rust_data_processing::types::{DataSet, DataType, Field, Schema, Value};
85//!
86//! let schema = Schema::new(vec![
87//! Field::new("id", DataType::Int64),
88//! Field::new("active", DataType::Bool),
89//! Field::new("score", DataType::Float64),
90//! ]);
91//! let ds = DataSet::new(
92//! schema,
93//! vec![
94//! vec![Value::Int64(1), Value::Bool(true), Value::Float64(10.0)],
95//! vec![Value::Int64(2), Value::Bool(false), Value::Float64(20.0)],
96//! vec![Value::Int64(3), Value::Bool(true), Value::Null],
97//! ],
98//! );
99//!
100//! let active_idx = ds.schema.index_of("active").unwrap();
101//! let filtered = filter(&ds, |row| matches!(row.get(active_idx), Some(Value::Bool(true))));
102//! let mapped = map(&filtered, |row| {
103//! let mut out = row.to_vec();
104//! // score *= 2.0
105//! if let Some(Value::Float64(v)) = out.get(2) {
106//! out[2] = Value::Float64(v * 2.0);
107//! }
108//! out
109//! });
110//!
111//! let sum = reduce(&mapped, "score", ReduceOp::Sum).unwrap();
112//! assert_eq!(sum, Value::Float64(20.0));
113//! ```
114//!
115//! ## Execution engine example (1.3 parallel pipeline)
116//!
117//! ```no_run
118//! use rust_data_processing::execution::{ExecutionEngine, ExecutionOptions};
119//! use rust_data_processing::processing::ReduceOp;
120//! use rust_data_processing::types::{DataSet, DataType, Field, Schema, Value};
121//!
122//! # fn main() {
123//! let schema = Schema::new(vec![
124//! Field::new("id", DataType::Int64),
125//! Field::new("active", DataType::Bool),
126//! Field::new("score", DataType::Float64),
127//! ]);
128//! let ds = DataSet::new(
129//! schema,
130//! vec![
131//! vec![Value::Int64(1), Value::Bool(true), Value::Float64(10.0)],
132//! vec![Value::Int64(2), Value::Bool(false), Value::Float64(20.0)],
133//! vec![Value::Int64(3), Value::Bool(true), Value::Null],
134//! ],
135//! );
136//!
137//! let engine = ExecutionEngine::new(ExecutionOptions {
138//! num_threads: Some(4),
139//! chunk_size: 1_024,
140//! max_in_flight_chunks: 4,
141//! });
142//!
143//! let active_idx = ds.schema.index_of("active").unwrap();
144//! let filtered = engine.filter_parallel(&ds, |row| matches!(row.get(active_idx), Some(Value::Bool(true))));
145//! let mapped = engine.map_parallel(&filtered, |row| row.to_vec());
146//! let sum = engine.reduce(&mapped, "score", ReduceOp::Sum).unwrap();
147//! assert_eq!(sum, Value::Float64(10.0));
148//!
149//! let snapshot = engine.metrics().snapshot();
150//! println!("rows_processed={}", snapshot.rows_processed);
151//! # }
152//! ```
153//!
154//! ## Quick examples: Phase 1 modules
155//!
156//! ### TransformSpec (declarative ETL)
157//!
158//! ```rust
159//! use rust_data_processing::pipeline::CastMode;
160//! use rust_data_processing::transform::{TransformSpec, TransformStep};
161//! use rust_data_processing::types::{DataSet, DataType, Field, Schema, Value};
162//!
163//! let ds = DataSet::new(
164//! Schema::new(vec![
165//! Field::new("id", DataType::Int64),
166//! Field::new("score", DataType::Int64),
167//! ]),
168//! vec![vec![Value::Int64(1), Value::Int64(10)], vec![Value::Int64(2), Value::Null]],
169//! );
170//!
171//! let out_schema = Schema::new(vec![
172//! Field::new("id", DataType::Int64),
173//! Field::new("score_f", DataType::Float64),
174//! ]);
175//!
176//! let spec = TransformSpec::new(out_schema.clone())
177//! .with_step(TransformStep::Rename { pairs: vec![("score".to_string(), "score_f".to_string())] })
178//! .with_step(TransformStep::Cast { column: "score_f".to_string(), to: DataType::Float64, mode: CastMode::Lossy })
179//! .with_step(TransformStep::FillNull { column: "score_f".to_string(), value: Value::Float64(0.0) });
180//!
181//! let out = spec.apply(&ds).unwrap();
182//! assert_eq!(out.schema, out_schema);
183//! ```
184//!
185//! ### Profiling (metrics + deterministic sampling)
186//!
187//! ```rust
188//! use rust_data_processing::profiling::{profile_dataset, ProfileOptions, SamplingMode};
189//! use rust_data_processing::types::{DataSet, DataType, Field, Schema, Value};
190//!
191//! let ds = DataSet::new(
192//! Schema::new(vec![Field::new("x", DataType::Float64)]),
193//! vec![vec![Value::Float64(1.0)], vec![Value::Null], vec![Value::Float64(3.0)]],
194//! );
195//!
196//! let rep = profile_dataset(
197//! &ds,
198//! &ProfileOptions { sampling: SamplingMode::Head(2), quantiles: vec![0.5] },
199//! )
200//! .unwrap();
201//! assert_eq!(rep.row_count, 2);
202//! ```
203//!
204//! ### Validation (DSL + reporting)
205//!
206//! ```rust
207//! use rust_data_processing::types::{DataSet, DataType, Field, Schema, Value};
208//! use rust_data_processing::validation::{validate_dataset, Check, Severity, ValidationSpec};
209//!
210//! let ds = DataSet::new(
211//! Schema::new(vec![Field::new("email", DataType::Utf8)]),
212//! vec![vec![Value::Utf8("ada@example.com".to_string())], vec![Value::Null]],
213//! );
214//!
215//! let spec = ValidationSpec::new(vec![
216//! Check::NotNull { column: "email".to_string(), severity: Severity::Error },
217//! ]);
218//!
219//! let rep = validate_dataset(&ds, &spec).unwrap();
220//! assert_eq!(rep.summary.total_checks, 1);
221//! ```
222//!
223//! ### Outliers (IQR / z-score / MAD)
224//!
225//! ```rust
226//! use rust_data_processing::outliers::{detect_outliers_dataset, OutlierMethod, OutlierOptions};
227//! use rust_data_processing::profiling::SamplingMode;
228//! use rust_data_processing::types::{DataSet, DataType, Field, Schema, Value};
229//!
230//! let ds = DataSet::new(
231//! Schema::new(vec![Field::new("x", DataType::Float64)]),
232//! vec![
233//! vec![Value::Float64(1.0)],
234//! vec![Value::Float64(1.0)],
235//! vec![Value::Float64(1.0)],
236//! vec![Value::Float64(1000.0)],
237//! ],
238//! );
239//!
240//! let rep = detect_outliers_dataset(
241//! &ds,
242//! "x",
243//! OutlierMethod::Iqr { k: 1.5 },
244//! &OutlierOptions { sampling: SamplingMode::Full, max_examples: 3 },
245//! )
246//! .unwrap();
247//! assert!(rep.outlier_count >= 1);
248//! ```
249//!
250//! ### SQL (Polars-backed)
251//!
252//! ```no_run
253//! # #[cfg(feature = "sql")]
254//! # fn main() -> Result<(), rust_data_processing::IngestionError> {
255//! use rust_data_processing::pipeline::DataFrame;
256//! use rust_data_processing::sql;
257//! use rust_data_processing::types::{DataSet, DataType, Field, Schema, Value};
258//!
259//! let ds = DataSet::new(
260//! Schema::new(vec![Field::new("id", DataType::Int64), Field::new("active", DataType::Bool)]),
261//! vec![vec![Value::Int64(1), Value::Bool(true)]],
262//! );
263//! let out = sql::query(&DataFrame::from_dataset(&ds)?, "SELECT id FROM df WHERE active = TRUE")?
264//! .collect()?;
265//! assert_eq!(out.row_count(), 1);
266//! # Ok(())
267//! # }
268//! # #[cfg(not(feature = "sql"))]
269//! # fn main() {}
270//! ```
271//!
272//! For more end-to-end examples, see the repository `README.md` and `API.md` (processing / aggregates).
273//! Aggregate semantics: `docs/REDUCE_AGG_SEMANTICS.md`.
274//!
275//! ### Reduce operations
276//!
277//! - [`processing::ReduceOp::Count`]: counts rows (including nulls)
278//! - [`processing::ReduceOp::Sum`], [`processing::ReduceOp::Min`], [`processing::ReduceOp::Max`]:
279//! operate on numeric columns and ignore nulls. If all values are null, these return
280//! `Some(Value::Null)`.
281//! - [`processing::ReduceOp::Mean`], [`processing::ReduceOp::Variance`], [`processing::ReduceOp::StdDev`]:
282//! use a numerically stable one-pass (Welford) accumulation; mean / sum-of-squares / L2 norm are
283//! returned as [`types::Value::Float64`]. Sample variance / std dev require at least two values.
284//! - [`processing::ReduceOp::CountDistinctNonNull`]: distinct non-null values (also for UTF-8 and bool).
285//! - [`processing::ReduceOp::Median`]: median of numeric values as [`types::Value::Float64`] (even length: mean of the two middle values).
286//! - [`pipeline::DataFrame::reduce`] provides the Polars-backed equivalent for whole-frame scalars.
287//! - [`processing::feature_wise_mean_std`]: one scan, mean + std for several numeric columns; [`pipeline::DataFrame::feature_wise_mean_std`] for Polars.
288//! - [`processing::arg_max_row`], [`processing::arg_min_row`], [`processing::top_k_by_frequency`]: row extrema and label top‑k.
289
290pub mod cdc;
291pub mod error;
292pub mod execution;
293pub mod export;
294pub mod ingestion;
295pub mod outliers;
296pub mod pipeline;
297pub mod privacy;
298pub mod processing;
299pub mod profiling;
300pub mod reports;
301#[cfg(feature = "sql")]
302pub mod sql;
303pub mod transform;
304pub mod types;
305pub mod validation;
306
307pub use error::{IngestionError, IngestionResult};