Validate Relational Datamodel

From QPR ProcessAnalyzer Wiki
Jump to navigation Jump to search

This page features a script designed to validate a relational model stored in QPR ProcessAnalyzer datatables. The script checks all datatables in the project where the script is run.

The validation includes the following checks: - Primary key columns must not contain null values. - Each row associated with the primary key columns must be unique. - For foreign keys, the target table must have a corresponding row that matches the reference from the source table's foreign key.

Create a new expression script to the project and run the script. When the script run is completed, open the script run log. There is either is message that the data model is valid, or there is a list of found validation errors.

let MAX_ROWS_TO_REPORT = 100;

function presentDataset(dataset) {
  return dataset.Rows.{
    let row = _;
    if (Count(dataset.Rows[0]) == 1) {
      row[0];
    } else {
      ToJson(row);
    }
  };
}

function getNullValueCountInColumn(datatable, columnName) {
  let nullRowCountData = datatable
    .SqlDataFrame
    .Where(Column(#expr{ columnName }) == null)
    .GroupBy([])
    .Aggregate(["count"], ["count"])
    .Select(["count"])
    .Collect();
  return nullRowCountData.Rows[0][0];
}

function getDuplicateRowsByColumns(datatable, columns) {
  if (Count(columns) == 0) {
    return [];
  }

  let duplicatePrimaryKeys = datatable
    .SqlDataFrame
    .GroupBy(columns)
    .Aggregate(["DuplicateCount"], ["count"])
    .Where(Column("DuplicateCount") > 1)
    .OrderByColumns(["DuplicateCount"], ["False"])
    .Head(MAX_ROWS_TO_REPORT)
    .Select(columns)
    .Collect();
  return presentDataset(duplicatePrimaryKeys);
}

function getInvalidRowsForForeignKey(datatable, foreignKey) {
  let TARGET_COLUMN_PREFIX = "Target";
  let foreignKeyColumns = foreignKey.Columns;
  let targetDataFrame = Project.DatatableByName(foreignKey.TargetDatatable)
    .SqlDataFrame
    .Select(foreignKeyColumns.{ let col = _;[(TARGET_COLUMN_PREFIX + col.Target): col.Target] }[0]);
  let invalidRows = datatable.SqlDataFrame
    .Where(#expr{ ToSqlExpression(StringJoin(" || ", foreignKeyColumns.Source.("Column(\"" + _ + "\") != null")))})
    .Join(targetDataFrame, foreignKeyColumns.{ let col = _;[col.Source: (TARGET_COLUMN_PREFIX + col.Target)] }[0], "leftouter")
    .Where(#expr{ ToSqlExpression(StringJoin(" || ", foreignKeyColumns.Target.("Column(\"" + TARGET_COLUMN_PREFIX + _ + "\") == null")))})
    .Head(MAX_ROWS_TO_REPORT)
    .Select(foreignKeyColumns.Source)
    .Collect();
  return presentDataset(invalidRows);
}

let errorsOccurred = false;

Project.Datatables.{
  let datatable = _;

  let primaryKeyColumns = datatable.Columns.Where(PrimaryKey).Name;
  for (let index = 0; index < Count(primaryKeyColumns); index++) {
    let nullRowCount = getNullValueCountInColumn(datatable, primaryKeyColumns[index]);
    if (nullRowCount > 0) {
      errorsOccurred = true;
      WriteLog("Datatable \"" + datatable.Name + "\" has " + nullRowCount + " rows with null values in primary key column \"" + primaryKeyColumns[index] + "\".");
    }
  }

  let duplicatePrimaryKeys = getDuplicateRowsByColumns(datatable, primaryKeyColumns);
  if (Count(duplicatePrimaryKeys) > 0) {
    errorsOccurred = true;
    WriteLog("Datatable \"" + datatable.Name + "\" has following duplicate rows for the primary key: " + StringJoin(", ", duplicatePrimaryKeys) + (Count(duplicatePrimaryKeys) >= MAX_ROWS_TO_REPORT ? " (+ more)": ""));
  }

  let foreignKeys = datatable.ForeignKeys;
  for (let index = 0; index < Count(foreignKeys); index++) {
    let invalidForeignKeyRows = getInvalidRowsForForeignKey(datatable, foreignKeys[index]);
    if (Count(invalidForeignKeyRows) > 0) {
      errorsOccurred = true;
      WriteLog("Datatable \"" + datatable.Name + "\" has in the foreign key " + ToJson(foreignKeys[index]) + " following invalid rows: " + StringJoin(", ", invalidForeignKeyRows) + (Count(invalidForeignKeyRows) >= MAX_ROWS_TO_REPORT ? " (+ more)": ""));
    }
  }
}

if (!errorsOccurred) {
  WriteLog("Data model is valid.");
}