Skip to main content

rust_data_processing/
privacy.rs

1//! Summaries for **UTF-8 cell changes** after caller-defined transforms (Phase 2).
2//!
3//! This crate does **not** provide legal classification of data; callers supply policy and
4//! interpret reports.
5
6use serde::Serialize;
7
8use crate::types::{DataSet, Value};
9
/// Per-column summary comparing string cells before and after a transform.
///
/// [`summarize_utf8_column_changes`] emits one of these for every requested column that
/// it can locate in both datasets.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct Utf8ColumnChangeSummary {
    /// Name of the column this summary describes, as supplied by the caller.
    pub column: String,
    /// Number of row pairs in which both the "before" and the "after" cell held a
    /// `Value::Utf8`; only such pairs are compared.
    pub non_null_cells_compared: usize,
    /// Number of compared pairs whose two strings were not equal.
    pub cells_changed: usize,
}
17
18/// Count how many non-null UTF-8 cells differ between `before` and `after` for each column name.
19pub fn summarize_utf8_column_changes(
20    before: &DataSet,
21    after: &DataSet,
22    columns: &[String],
23) -> Vec<Utf8ColumnChangeSummary> {
24    let mut out = Vec::with_capacity(columns.len());
25    for name in columns {
26        let Some(bi) = before.schema.index_of(name) else {
27            continue;
28        };
29        let Some(ai) = after.schema.index_of(name) else {
30            continue;
31        };
32        let mut non_null = 0usize;
33        let mut changed = 0usize;
34        for (br, ar) in before.rows.iter().zip(after.rows.iter()) {
35            if let (Some(Value::Utf8(sb)), Some(Value::Utf8(sa))) = (br.get(bi), ar.get(ai)) {
36                non_null += 1;
37                if sb != sa {
38                    changed += 1;
39                }
40            }
41        }
42        out.push(Utf8ColumnChangeSummary {
43            column: name.clone(),
44            non_null_cells_compared: non_null,
45            cells_changed: changed,
46        });
47    }
48    out
49}
50
51/// Pretty JSON for [`summarize_utf8_column_changes`].
52pub fn render_privacy_report_json(
53    rows: &[Utf8ColumnChangeSummary],
54) -> crate::error::IngestionResult<String> {
55    serde_json::to_string_pretty(rows).map_err(|e| crate::error::IngestionError::SchemaMismatch {
56        message: format!("privacy report json: {e}"),
57    })
58}
59
60/// Short Markdown list for human review.
61pub fn render_privacy_report_markdown(rows: &[Utf8ColumnChangeSummary]) -> String {
62    let mut s = String::from("## Privacy / masking summary (UTF-8 diffs)\n\n");
63    for r in rows {
64        s.push_str(&format!(
65            "- **{}**: changed **{}** / **{}** non-null cells\n",
66            r.column, r.cells_changed, r.non_null_cells_compared
67        ));
68    }
69    s
70}
71
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{DataType, Field, Schema};

    /// Masking the single e-mail value must register as exactly one changed cell out of
    /// one compared cell; the `Null` row stays out of both counters.
    #[test]
    fn summary_counts_changes() {
        let schema = Schema::new(vec![Field::new("email", DataType::Utf8)]);
        let original_rows = vec![vec![Value::Utf8("a@b.c".into())], vec![Value::Null]];
        let masked_rows = vec![vec![Value::Utf8("a@***".into())], vec![Value::Null]];
        let before = DataSet::new(schema.clone(), original_rows);
        let after = DataSet::new(schema, masked_rows);

        let columns = ["email".to_string()];
        let report = summarize_utf8_column_changes(&before, &after, &columns);

        assert_eq!(report[0].cells_changed, 1);
        assert_eq!(report[0].non_null_cells_compared, 1);
    }
}