Anonymize data: Difference between revisions
No edit summary |
No edit summary |
||
Line 51: | Line 51: | ||
Anonymize(sourceDatatable.SqlDataFrame.Collect(), ["Case Id", "Company Code", "Customer Name"]) | Anonymize(sourceDatatable.SqlDataFrame.Collect(), ["Case Id", "Company Code", "Customer Name"]) | ||
.Persist(sourceDatatable.Name + "_anonymized", #{"ProjectId": sourceDatatable.Project.Id}) | .Persist(sourceDatatable.Name + "_anonymized", #{"ProjectId": sourceDatatable.Project.Id}) | ||
</pre> | |||
Data can be masked using the following script: | |||
<pre> | |||
function Anonymize(df, cols, maskCharacters) { | |||
let mask = StringJoin("", Repeat(maskCharacters, "*")); | |||
for (let i = 0; i < CountTop(cols); ++i) { | |||
let shuffledData = Shuffle(NumberRange(0, CountTop(df.Rows) - 1)); | |||
let col = cols[i]; | |||
df = df.SetColumns([ | |||
`${col}`: () => If(Column(col).length < mask.length, mask, mask + Column(col).Substring(mask.length)) | |||
]); | |||
} | |||
} | |||
let sourceDatatable = DataTableById(1); | |||
Anonymize(sourceDatatable.SqlDataFrame.Collect(), ["Case Id", "Company Code", "Customer Name"], 5) | |||
.Persist(sourceDatatable.Name + "_anonymized1", #{"ProjectId": sourceDatatable.Project.Id}) | |||
</pre> | </pre> | ||
Revision as of 10:36, 14 December 2022
Process mining usually involves confidential data, so the ability to anonymize that data is an essential feature. Data can be anonymized with scripts using the following methods:
- When data is extracted from a source system, the data is immediately anonymized and the anonymized data is stored to datatables.
- Data in the datatables is anonymized and stored to other datatables, from which the anonymized data can be exported or visualized in dashboards.
If the original data is stored in the system, make sure that users who are only allowed to see the anonymized data don't have access to the original data.
The following example script anonymizes selected columns in a datatable and writes the result to a new datatable. Each anonymized data value gets a numeric value starting from one (this can also be called pseudonymization).
// Lookup used for pseudonymization: column name -> dictionary that maps each
// original value of that column to its replacement value.
let anonymizationDict = #{};

// Returns the replacement for oldValue in the column columnName.
// A value seen for the first time in a column is assigned the text form of
// (number of values already stored for that column + 1); a value that was
// already seen gets the replacement stored for it earlier, so equal originals
// stay equal after anonymization.
function AnonymizeColumn(columnName, oldValue) {
  let dict;
  if (!anonymizationDict.ContainsKey(columnName)) {
    // First value of this column: create its dictionary.
    dict = #{};
    anonymizationDict.Set(columnName, dict);
  }
  else {
    dict = anonymizationDict[columnName];
  }
  if (!dict.ContainsKey(oldValue)) {
    dict.Set(oldValue, `${dict.Count + 1}`);
  }
  return dict[oldValue];
}

// Replaces the values of every column listed in cols with the result of
// AnonymizeColumn, one column at a time.
//   df   - dataframe to anonymize
//   cols - array of names of the columns to anonymize
// NOTE(review): there is no explicit return statement; the value the caller
// chains .Persist on is whatever this function body evaluates to - confirm
// that this is the final dataframe.
function Anonymize(df, cols) {
  for (let i = 0; i < CountTop(cols); ++i) {
    let col = cols[i];
    df = df.SetColumns([
      `${col}`: () => AnonymizeColumn(col, Column(col))
    ]);
  }
}

// Anonymize three columns of datatable 1 and store the result as a new
// datatable named "<source name>_anonymized" in the same project.
let sourceDatatable = DataTableById(1);
Anonymize(sourceDatatable.SqlDataFrame.Collect(), ["Case Id", "Company Code", "Customer Name"])
  .Persist(sourceDatatable.Name + "_anonymized", #{"ProjectId": sourceDatatable.Project.Id})
The following example anonymizes data by shuffling values in each of the selected columns:
// Anonymizes the columns listed in cols by redistributing each column's
// existing values across the rows in shuffled order.
//   df   - dataframe to anonymize
//   cols - array of names of the columns to shuffle
// NOTE(review): there is no explicit return statement; the value the caller
// chains .Persist on is whatever this function body evaluates to - confirm
// that this is the final dataframe.
function Anonymize(df, cols) {
  for (let i = 0; i < CountTop(cols); ++i) {
    // Row indexes 0 .. (row count - 1) in shuffled order; a new order is
    // generated for every column.
    let shuffledData = Shuffle(NumberRange(0, CountTop(df.Rows) - 1));
    let col = cols[i];
    // Position in shuffledData; advanced by the column expression below.
    let j = 0;
    df = df.SetColumns([
      // Each evaluation takes the column value found at the next shuffled row
      // index. Presumably the expression is evaluated once per row, in row
      // order - verify, since the result depends on the j++ side effect.
      `${col}`: () => df.Column(col)[shuffledData[j++]]
    ]);
  }
}

// Shuffle three columns of datatable 1 and store the result as a new
// datatable named "<source name>_anonymized" in the same project.
let sourceDatatable = DataTableById(1);
Anonymize(sourceDatatable.SqlDataFrame.Collect(), ["Case Id", "Company Code", "Customer Name"])
  .Persist(sourceDatatable.Name + "_anonymized", #{"ProjectId": sourceDatatable.Project.Id})
Data can be masked using the following script:
// Masks the beginning of every value in the columns listed in cols.
//   df             - dataframe to anonymize
//   cols           - array of names of the columns to mask
//   maskCharacters - number of "*" characters in the mask
// A value shorter than the mask is replaced by the mask alone; otherwise the
// result is the mask followed by Substring(mask.length) of the value
// (presumably the remainder of the value after the masked characters).
// NOTE(review): there is no explicit return statement; the value the caller
// chains .Persist on is whatever this function body evaluates to - confirm
// that this is the final dataframe.
function Anonymize(df, cols, maskCharacters) {
  // The mask is the same for every column and row, so it is built once.
  let mask = StringJoin("", Repeat(maskCharacters, "*"));
  for (let i = 0; i < CountTop(cols); ++i) {
    let col = cols[i];
    df = df.SetColumns([
      `${col}`: () => If(Column(col).length < mask.length, mask, mask + Column(col).Substring(mask.length))
    ]);
  }
}

// Mask the first 5 characters in three columns of datatable 1 and store the
// result as a new datatable named "<source name>_anonymized1" in the same
// project.
let sourceDatatable = DataTableById(1);
Anonymize(sourceDatatable.SqlDataFrame.Collect(), ["Case Id", "Company Code", "Customer Name"], 5)
  .Persist(sourceDatatable.Name + "_anonymized1", #{"ProjectId": sourceDatatable.Project.Id})
Columns containing confidential data can be removed as follows:
// Copy datatable 1 without its confidential columns into a new datatable
// named "<source name>_anonymized" in the same project.
let sourceDatatable = DataTableById(1);
let confidentialColumns = ["Case Id", "Company Code", "Customer Name"];
let targetName = sourceDatatable.Name + "_anonymized";
let targetOptions = #{"ProjectId": sourceDatatable.Project.Id};
sourceDatatable.SqlDataFrame.Collect()
  .RemoveColumns(confidentialColumns)
  .Persist(targetName, targetOptions)