Anonymize data: Difference between revisions

Revision as of 13:11, 14 December 2022

Confidential data is usually handled in process mining, and thus being able to anonymize data is an essential feature. Data can be anonymized using scripts with the following methods:

Within the source system, for example a VIEW in source database
Within the query from the source system, for example, in the SQL SELECT statement
When data is extracted from a source system, the data is immediately anonymized and the anonymized data is stored to datatables.
Inside the SQL Sandbox database
Transformation for data loaded into QPR database: Data in the datatables is anonymized and stored to other datatables, from which the anonymized data can be exported or visualized in dashboards.
Transformation when model is loaded into memory
- Expression language query
- SQL query
Transformation in a Dashboard query: Individual Dashboard(s) can show the data in anonymized format
Transformation when exporting data from QPR: If the original data is stored in the system, make sure that users who are only allowed to see the anonymized data don't have access to the original data.

Pseudonymization

The following example script anonymizes selected columns in a datatable and writes the result to a new datatable. Each distinct value in a column is replaced by a numeric identifier starting from one (this can also be called pseudonymization).

// Per-column lookup tables: column name -> dictionary that maps each original
// value of that column to its pseudonym.
let anonymizationMappings = #{};

// Returns the pseudonym of originalValue within the column columnName,
// creating a new pseudonym the first time a value is seen.
// Pseudonyms have the form "<columnName>: <n>", where n is the number of
// distinct values registered for that column so far, starting from 1.
function PseudonymizeColumn(columnName, originalValue) {
  // Make sure the column has its own lookup table before using it.
  if (!anonymizationMappings.ContainsKey(columnName)) {
    anonymizationMappings.Set(columnName, #{});
  }
  let pseudonyms = anonymizationMappings[columnName];
  if (!pseudonyms.ContainsKey(originalValue)) {
    let nextNumber = pseudonyms.Count + 1;
    pseudonyms.Set(originalValue, `${columnName}: ${nextNumber}`);
  }
  return pseudonyms[originalValue];
}

// Replaces the values of the given columns with per-column pseudonyms
// produced by PseudonymizeColumn.
//   df:   data frame whose columns are rewritten with SetColumns.
//   cols: array of names of the columns to pseudonymize.
// Returns the data frame resulting from the last SetColumns call (df itself
// when cols is empty), so that the caller can chain further calls such as
// Persist on the result.
function Pseudonymize(df, cols) {
  for (let i = 0; i < CountTop(cols); ++i) {
    let col = cols[i];
    // The new column is defined under the same name as the original column.
    df = df.SetColumns([
      `${col}`: () => PseudonymizeColumn(col, Column(col))
    ]);
  }
  // Explicitly hand the transformed data frame back to the caller.
  return df;
}

// Pseudonymize three columns of datatable 1 and persist the result under the
// name "<source datatable name>_anonymized", passing the source datatable's
// project id as the ProjectId option.
let sourceDatatable = DataTableById(1);
let columnsToAnonymize = ["Case Id", "Company Code", "Customer Name"];
let anonymizedData = Pseudonymize(sourceDatatable.SqlDataFrame.Collect(), columnsToAnonymize);
anonymizedData.Persist(sourceDatatable.Name + "_anonymized", #{"ProjectId": sourceDatatable.Project.Id})

Shuffling

The following example anonymizes data by shuffling the values within each of the selected columns:

// Anonymizes the given columns by reordering the values within each column.
//   df:   data frame to anonymize.
//   cols: array of names of the columns whose values are shuffled.
// Returns the data frame resulting from the last SetColumns call (df itself
// when cols is empty), so that the caller can chain further calls such as
// Persist on the result.
// NOTE(review): this function has the same name as the Shuffle function it
// calls on the index range below. Confirm that the inner call resolves to the
// array-shuffling function and not recursively to this one; if it does not,
// rename this function together with its caller.
function Shuffle(df, cols) {
  for (let i = 0; i < CountTop(cols); ++i) {
    let col = cols[i];
    // A separately shuffled list of row indexes is created for every column.
    let shuffledData = Shuffle(NumberRange(0, CountTop(df.Rows) - 1));
    // Read the original column values once instead of once per row.
    let originalValues = df.Column(col);
    let j = 0;
    df = df.SetColumns([
      `${col}`: () => originalValues[shuffledData[j++]]
    ]);
  }
  // Explicitly hand the transformed data frame back to the caller.
  return df;
}

// Shuffle three columns of datatable 1 and persist the result under the name
// "<source datatable name>_anonymized", passing the source datatable's
// project id as the ProjectId option.
let sourceDatatable = DataTableById(1);
let columnsToShuffle = ["Case Id", "Company Code", "Customer Name"];
let shuffledData = Shuffle(sourceDatatable.SqlDataFrame.Collect(), columnsToShuffle);
shuffledData.Persist(sourceDatatable.Name + "_anonymized", #{"ProjectId": sourceDatatable.Project.Id})

Masking

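Data can be masked using the following script. The given number of leading characters in each value is replaced with asterisks (*); values that are missing (null) or shorter than the mask are replaced by the mask alone.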

// Masks the beginning of every value in the given columns with asterisks.
//   df:             data frame to anonymize.
//   cols:           array of names of the columns to mask.
//   maskCharacters: number of "*" characters in the mask.
// Returns the data frame resulting from the last SetColumns call (df itself
// when cols is empty), so that the caller can chain further calls such as
// Persist on the result.
function Mask(df, cols, maskCharacters) {
  // The mask is maskCharacters asterisks joined into one string.
  let mask = StringJoin("", Repeat(maskCharacters, "*"));
  for (let i = 0; i < CountTop(cols); ++i) {
    let col = cols[i];
    // Null values and values shorter than the mask become the mask itself;
    // otherwise the mask is followed by the rest of the original value.
    df = df.SetColumns([
      `${col}`: () => If(Column(col) == null || Column(col).length < mask.length, mask, mask + Column(col).Substring(mask.length))
    ]);
  }
  // Explicitly hand the transformed data frame back to the caller.
  return df;
}

// Mask three columns of datatable 1 with a five-character mask and persist
// the result under the name "<source datatable name>_anonymized", passing the
// source datatable's project id as the ProjectId option.
let sourceDatatable = DataTableById(1);
let columnsToMask = ["Case Id", "Company Code", "Customer Name"];
let maskedData = Mask(sourceDatatable.SqlDataFrame.Collect(), columnsToMask, 5);
maskedData.Persist(sourceDatatable.Name + "_anonymized", #{"ProjectId": sourceDatatable.Project.Id})

Column removal

Columns containing confidential data can be removed as follows:

// Remove three confidential columns from datatable 1 and persist the
// remaining data under the name "<source datatable name>_anonymized", passing
// the source datatable's project id as the ProjectId option.
let sourceDatatable = DataTableById(1);
let confidentialColumns = ["Case Id", "Company Code", "Customer Name"];
let reducedData = sourceDatatable.SqlDataFrame.Collect().RemoveColumns(confidentialColumns);
reducedData.Persist(sourceDatatable.Name + "_anonymized", #{"ProjectId": sourceDatatable.Project.Id})

Anonymize data: Difference between revisions

Revision as of 13:11, 14 December 2022

Contents

Pseudonymization

Shuffling

Masking

Column removal

Navigation menu

Anonymize data: Difference between revisions

Revision as of 13:11, 14 December 2022

Pseudonymization

Shuffling

Masking

Column removal

Navigation menu

Search