Expression Script Examples: Difference between revisions

From QPR ProcessAnalyzer Wiki
Jump to navigation Jump to search
(Added an example on how to use SqlDataFrames to filter rows based on how often column values occur in the whole dataset.)
Line 192: Line 192:
   .OrderByColumns(["Month"], [true])
   .OrderByColumns(["Month"], [true])
   .Collect().ToCsv();
   .Collect().ToCsv();
</syntaxhighlight>
=== Function for filtering SqlDataFrame by removing rows having, or replacing, the most infrequently occurring column values ===
<syntaxhighlight lang="typescript" line="1">
/***
* @name ColumnWithMinUsage
* @description
* Generic function that can be used to filter out the most infrequently occurring attribute values or replace their values
* with given common value.
* @param df:
* DataFrame to operate on.
* @param columnName:
* Name of the column to be filtered.
* @param newColumnName:
* Name of the column that will contain the new value of the original column after filtering (if includeOthers was applied).
* @param maxNumUniqueValues:
* Maximum number of unique values to include into the comparison for each attribute column. If the amount of unique values for any attribute exceeds this value, only given number of attributes are included that have the highest usage.
* @param minValueUsage:
* Minimum total usage of a value included into the comparison. The number of cases having every returned value should be at least given percentage (a float value between 0.0 and 1.0) of all the compared cases.
* @param includeOthers:
* Should the rest of the attribute values not included due to MinValueUsage or MaxNumUniqueValues filtering be included as an aggregated "Others" value?
* If not empty/null, defines the name used for these other-values.
*
* NOTE(review): when both maxNumUniqueValues and minValueUsage are null, the if-block below is skipped
* and no dataframe expression is evaluated - confirm what callers receive in that case.
*/
function ColumnWithMinUsage(df, columnName, newColumnName, maxNumUniqueValues, minValueUsage, includeOthers)
{
  // Total row count of df; joined to the per-value counts through the constant "__Join2" column.
  let all = df
.GroupBy([])
.Aggregate(["NAllTotal"], ["Count"])
.WithColumn("__Join2", #sql{1});
  let minValueUsageEnabled = !IsNullTop(minValueUsage);
  let maxNumUniqueValuesEnabled = !IsNullTop(maxNumUniqueValues);
  if (minValueUsageEnabled || maxNumUniqueValuesEnabled) {
// Perform column value-based filtering if minValueUsageEnabled or maxNumUniqueValuesEnabled is defined.
    let valueColumnName = "__ValueNew";
let filteredValuesColumns = [valueColumnName: columnName];
// Number of rows per distinct value of the filtered column.
let filteredValues = df
  .GroupBy([columnName]).Aggregate(["Count"], ["Count"]);
if (minValueUsageEnabled) {
  // Usage = rows having the value / all rows.
  filteredValues = filteredValues
.WithColumn("__Join", #sql{1})
.Join(all, ["__Join": "__Join2"], "leftouter")
        .WithColumn("Usage", #sql{Column("Count") / Column("NAllTotal")});
  filteredValuesColumns = Concat(filteredValuesColumns, ["Usage"]);
}
if (maxNumUniqueValuesEnabled) {
  // Row number based on the "Count" column; compared against maxNumUniqueValues below.
  filteredValues = filteredValues
.WithRowNumberColumn("RowNumber", ["Count"], null, [false]);
  filteredValuesColumns = Concat(filteredValuesColumns, ["RowNumber"]);
}
filteredValues = filteredValues
  .Select(filteredValuesColumns);
// Generate select returning all the accepted values.
let allValues = filteredValues
  .(minValueUsageEnabled ? Where(#sql{Column("Usage") >= Variable("minValueUsage")}) : _)
  .(maxNumUniqueValuesEnabled ? Where(#sql{Column("RowNumber") <= Variable("maxNumUniqueValues")}) : _)
  .Select([valueColumnName, newColumnName: valueColumnName]);
if (!IsNullTop(includeOthers)) {
  // If includeOthers is defined, replace original values with the variable defined in includeOthers.
  // NOTE(review): with both limits enabled the two Where calls below are combined (Usage < min AND RowNumber > max),
  // which is narrower than the complement of the accepted values (Usage < min OR RowNumber > max).
  let otherValues = filteredValues
.(minValueUsageEnabled ? Where(#sql{Column("Usage") < Variable("minValueUsage")}) : _)
.(maxNumUniqueValuesEnabled ? Where(#sql{Column("RowNumber") > Variable("maxNumUniqueValues")}) : _)
.WithColumn(newColumnName, #sql{Variable("includeOthers")})
.Select([valueColumnName, newColumnName]);
  allValues = allValues.Append(otherValues)
}
// Keep only rows whose value is in allValues; the join also brings in the newColumnName column.
df.Join(allValues, [columnName: valueColumnName], "inner")
  .RemoveColumns([valueColumnName]);
  }
}
// The following example will return only rows containing two of the most common values for Region-column.
// let df = DataTableById(2).SqlDataFrame;
// df = ColumnWithMinUsage(df, "Region", "_Filtered", 2, null, null);
// df.Collect().ToCsv();
// The following example will return all input rows, but will replace the values of rows whose Region-column
// has a value used by less than 15% of all the rows with a new value: "_Others".
// let df = DataTableById(2).SqlDataFrame;
// df = ColumnWithMinUsage(df, "Region", "_Filtered", null, 0.15, "_Others");
// df.Collect().ToCsv();
</syntaxhighlight>
</syntaxhighlight>

Revision as of 10:56, 13 June 2023

This page contains script examples written in the QPR ProcessAnalyzer expression language. See how expression scripts can be created in the Workspace. Documentation for the syntax, functions and entities can be found on the main page in the KPI Expression Language section.

Call web service

Contact a web service, fetch some data, and store it in a datatable.

// Name of the datatable that receives the web service response.
let datatableName = "Web Service Data";
// Call the server info endpoint; the response is read below through its keys.
let webServiceData = CallWebService(
    #{"Address": "https://processanalyzer.onqpr.com/qprpa/api/serverinfo"}
);

// Look for the target datatable by name in the current project.
// If there is no match, create it with the three columns used by the import.
let targetDatatable = Project.Datatables.Where(name==datatableName);
if (Count(targetDatatable) == 0) {
	targetDatatable = Project.CreateDatatable(datatableName)
	.AddColumn("Setting name", "String")
	.AddColumn("Setting value", "String")
	.AddColumn("Data read", "DateTime");
} else {
	// Where returned a collection; continue with its first item.
	targetDatatable = targetDatatable[0];
}

// Read the time once so that every imported row carries the same "Data read" value.
let currentTime = Now;
// Build one row per key of the response: [key, value of the key, read time].
let dataAsDf = ToDataFrame(
	webServiceData.keys.{
        let key = _;
        [key, webServiceData[key], currentTime];
    },
	["Setting name", "Setting value", "Data read"]
);
// Import with "Append": true and log the number of rows in the imported dataframe.
targetDatatable.Import(dataAsDf, #{"Append":true});
WriteLog(`${CountTop(dataAsDf.Rows)} rows written to datatable`);

Store data to datatable

Get all models in the system and store them to a datatable.

// Create a new datatable in the current project; the current time is part of its name.
let newDatatable = Project
    .CreateDatatable("Models list " + ToString(Now, "dd.MM.yyyy HH:mm:ss"))
    .AddColumn("Model name", "String")
    .AddColumn("Project name", "String")
    .AddColumn("Created time", "DateTime")
    .AddColumn("Cases", "Integer");
// Remember the start time so the duration of listing the models can be logged.
let startTime = Now;
// One row per model: Name, Project.Name, CreatedDate and NCases.
let modelsData = ToDataFrame(
    Models.([Name, Project.Name, CreatedDate, NCases]),
    ["Model name", "Project name", "Created time", "Cases"]
);
WriteLog(`Listing models took ${(Now - startTime).TotalSeconds.Round(2)} seconds.`);
// Store the collected rows into the new datatable and log its id.
newDatatable.Import(modelsData);
WriteLog(`Datatable ${newDatatable.Id} created.`);

Convert datatable column data

This script can be used to convert a single column into a numerical data type. To use the script, you need to set up the following at the beginning of the script:

  • Project name where the datatable is located.
  • Datatable name
  • Name of the column to be converted

Note that the conversion fails if there is data that cannot be converted into numerical format. The conversion assumes that a period (.) is used as the decimal point.

// Settings: project name, datatable name and the column to convert. Change these before running.
let projectName = "New Project";
let datatableName = "qpr processanalyzer events";
let columnName = "Event order in case";

// Take the first project and the first datatable matching the configured names.
let project = (Projects.Where(Name==projectName))[0];
let datatable = (project.Datatables.Where(Name==datatableName))[0];
// Rewrite the column so that null stays null and every other value is passed to ToFloat,
// then persist the result under the same datatable name and project with "Append": false.
DatatableById(datatable.Id).DataFrame
.SetColumns([
	columnName: () => {
		let data = Column(columnName);
		if (data == null) {
			null;
		 } else {
			ToFloat(data);
		}
	}
])
.Persist(datatable.Name, ["ProjectId": project.Id, "Append": false]);

Instead of converting to numeric (with the ToFloat function), data can be converted into string using the ToString function.

Show DataFrame as HTML table

This script defines a function that shows a dataframe as an HTML table, and applies the function to a literal dataframe.

/**
 * Renders a dataframe as an HTML table string.
 * The header row gets one <th> element per column name, and every dataframe row
 * becomes a <tr> element with one <td> per cell (cell values go through ToString).
 * NOTE(review): column names and cell values are inserted as-is, without HTML escaping.
 * @param df:
 * Dataframe to render.
 * @returns
 * The table markup as a string.
 */
function dataframeToHtmlTable(df) {
	// The template starts on the same line as "return" so the returned value is unambiguous.
	return `<table>
	<tr>
		${StringJoin("\r\n\t\t", df.columns.`<th>${_}</th>`)}
	</tr>
	${StringJoin("", df.Rows.(
		"\r\n\t<tr>" + StringJoin("", _.`\r\n\t\t<td>${ToString(_)}</td>`) + "\r\n\t</tr>"
	))}
</table>`
}

// Literal dataframe with three rows and three named columns, used as sample input.
let data = ToDataFrame(
	[
		["one", "two", "three"],
		["four", "five", "six"],
		["seven", "eight", "nine"]
	],
	["Column 1", "Column 2", "Column 3"]
);

// The script result is the HTML markup produced for the sample dataframe.
return dataframeToHtmlTable(data);

Copy local datatables to Snowflake

// Copies all datatables in a project to another project including datatable contents.
// Usage instructions:
// 1. Create expression script in the project from where you want to copy the datatables.
// 2. Create a new project named as "<name of the project to be moved> - Snowflake". New datatables will be created here. E.g., when moving project named "SAP_OrderToCash", the target project should be named as "SAP_OrderToCash - Snowflake".
// 3. Run the script.
// NOTE: Columns of type "Any" will be created as "String"-columns in Snowflake, thus it is recommended that actual data types are set for the tables prior to the move.

// The source is the project the script runs in; the target is looked up by the derived name.
let sourceProject = Project;
let sourceProjectName = Project.Name;
let targetProjectName = `${sourceProjectName} - Snowflake`;
let targetProject = First(Projects.Where(Name == targetProjectName));
// Stop early if the target project has not been created yet.
if (IsNull(targetProject)) {
  WriteLog(`Unable to find target project named "${targetProjectName}". Aborting operation.`);
  return;
}
let dts = sourceProject.DataTables;
WriteLog(`Copying all ${CountTop(dts)} data tables found in project "${sourceProject.Name}" (id: ${sourceProject.Id}) to Snowflake in project "${targetProject.Name}" (id: ${targetProject.Id})`);
// Process every source datatable. Inside the block, Name, Id, NRows and NColumns are
// used without a qualifier; presumably they resolve against the current datatable (the same item as sourceDt).
dts.{
  let sourceDt = _;
  WriteLog(`Starting to copy data table "${Name}" (id: ${Id}) having ${NRows} rows and ${NColumns} columns.`);
  let targetDt;
  targetDt = targetProject.DatatableByName(sourceDt.Name);
  // Only datatables missing from the target project are created and filled;
  // an existing datatable with the same name is left untouched and only logged.
  if (targetDt == null) {
    targetDt = targetProject.CreateDataTable(sourceDt.Name, #{"Connection": CreateSnowflakeConnection(#{"ProjectId": targetProject.Id})});
    targetDt.Import(sourceDt.SqlDataFrame);
    WriteLog(`Finished copying data table "${Name}" (id: ${Id}) to table "${targetDt.Name}" (id: ${targetDt.Id})`);
  } else {
    WriteLog(`Datatable already exist "${Name}" (id: ${Id}) to table "${targetDt.Name}" (id: ${targetDt.Id})`);
  }
}
WriteLog(`Finished copying all the data tables found in project "${sourceProject.Name}" (id: ${sourceProject.Id}) to Snowflake in project "${targetProject.Name}" (id: ${targetProject.Id})`);

If you don't need to copy the data but only want to create the Snowflake datatables with their columns, you can replace the statement targetDt.Import(sourceDt.SqlDataFrame); in the script above with

targetDt.Import(sourceDt.SqlDataFrame.head(0));

Copy single datatable to Snowflake

This script creates a copy of a single datatable to Snowflake. Replace the <tableId1> with the id of the source datatable.

// Copies the datatable with the given id into a datatable named "<source name> - Snowflake".
// The copy is persisted with "Append": false through a Snowflake connection created
// for the project of the source datatable.
// @param dataTableId: Id of the source datatable.
function CopyDataTableToSnowflake(dataTableId)
{
  let sourceDt = DataTableById(dataTableId);
  sourceDt.SqlDataFrame.Persist(`${sourceDt.Name} - Snowflake`, #{"Append": false, "Connection": CreateSnowflakeConnection(#{"ProjectId": sourceDt.Project.Id})});
}
// Replace <tableId1> with the id of the datatable to copy before running.
CopyDataTableToSnowflake(<tableId1>);

Create a copy of a data table that has all Any-type columns changed to String-type columns

// Persists a copy of the given datatable, named "<source name> - Converted", into the same project.
// In the copy, every column whose data type is "Any" is replaced by the same-named column cast to "ShortString".
// @param dataTableId: Id of the source datatable.
function ConvertAnyDataTypesToStringsToNewTable(dataTableId)
{
  let dt = DataTableById(dataTableId);
  let sdf = dt.SqlDataFrame;
  let cts = dt.ColumnTypes;
  // Go through the column types and re-define each "Any" column on the dataframe.
  cts.{
    let ct = _;
    if (ct.DataType == "Any") {
      // The column name is stored in a variable so the #sql expression can read it with Variable("n").
      let n = ct.Name;
      sdf = sdf.WithColumn(ct.Name, #sql{Cast(Column(Variable("n")), "ShortString")});
    }
  };
  sdf.Persist(`${dt.Name} - Converted`, #{"Append": false, "ProjectId": dt.Project.Id});
}
// Replace <dataTableId> with the id of the datatable to convert before running.
ConvertAnyDataTypesToStringsToNewTable(<dataTableId>);

Query number of rows in given data table having a datetime value in given year grouped by month and return resulting table as CSV

SqlDataFrame is used in order to avoid loading the whole datatable into memory first. Filtering is performed as the first operation in order to minimize the amount of work required from the data source of the datatable.

// Replace <data table id> with the id of the datatable to query.
// Steps: keep rows whose "Start Time" year is 2014, derive a "Month" column from "Start Time",
// count the rows of each month into a "Count" column, order by "Month", then collect the
// result and convert it to CSV.
DataTableById(<data table id>)
  .SqlDataFrame
  .Where(#sql{2014 == Year(Column("Start Time"))})
  .WithColumn("Month", #sql{Month(Column("Start Time"))})
  .GroupBy(["Month"]).Aggregate(["Count"], ["Count"])
  .OrderByColumns(["Month"], [true])
  .Collect().ToCsv();

Function for filtering SqlDataFrame by removing rows having, or replacing, the most infrequently occurring column values

/***
 * @name ColumnWithMinUsage
 * @description
 * Generic function that can be used to filter out the most infrequently occurring attribute values or replace their values
 * with given common value.
 * @param df:
 * DataFrame to operate on.
 * @param columnName:
 * Name of the column to be filtered.
 * @param newColumnName:
 * Name of the column that will contain the new value of the original column after filtering (if includeOthers was applied).
 * @param maxNumUniqueValues:
 * Maximum number of unique values to include into the comparison for each attribute column. If the amount of unique values for any attribute exceeds this value, only given number of attributes are included that have the highest usage.
 * @param minValueUsage:
 * Minimum total usage of a value included into the comparison. The number of cases having every returned value should be at least given percentage (a float value between 0.0 and 1.0) of all the compared cases.
 * @param includeOthers:
 * Should the rest of the attribute values not included due to MinValueUsage or MaxNumUniqueValues filtering be included as an aggregated "Others" value?
 * If not empty/null, defines the name used for these other-values.
 * @returns
 * The input dataframe with the additional column newColumnName. Rows whose value was not accepted are removed,
 * or, when includeOthers is given, kept with newColumnName set to includeOthers.
 * When both maxNumUniqueValues and minValueUsage are null, no filtering takes place and newColumnName is a copy of columnName.
 */
function ColumnWithMinUsage(df, columnName, newColumnName, maxNumUniqueValues, minValueUsage, includeOthers)
{
  let minValueUsageEnabled = !IsNullTop(minValueUsage);
  let maxNumUniqueValuesEnabled = !IsNullTop(maxNumUniqueValues);
  if (!(minValueUsageEnabled || maxNumUniqueValuesEnabled)) {
    // No limit was given: every value is accepted, so the new column simply repeats the original one.
    return df.WithColumn(newColumnName, #sql{Column(Variable("columnName"))});
  }

  // Perform column value-based filtering, as minValueUsage and/or maxNumUniqueValues is defined.
  let valueColumnName = "__ValueNew";
  let filteredValuesColumns = [valueColumnName: columnName];
  // Number of rows per distinct value of the filtered column.
  let filteredValues = df
    .GroupBy([columnName]).Aggregate(["Count"], ["Count"]);
  if (minValueUsageEnabled) {
    // The total row count is needed only for the usage calculation. It is joined to every
    // value row through a constant key column.
    let all = df
      .GroupBy([])
      .Aggregate(["NAllTotal"], ["Count"])
      .WithColumn("__Join2", #sql{1});
    // NOTE(review): both operands are counts; confirm that the division yields a fraction
    // (not an integer division) on the data source in use.
    filteredValues = filteredValues
      .WithColumn("__Join", #sql{1})
      .Join(all, ["__Join": "__Join2"], "leftouter")
      .WithColumn("Usage", #sql{Column("Count") / Column("NAllTotal")});
    filteredValuesColumns = Concat(filteredValuesColumns, ["Usage"]);
  }
  if (maxNumUniqueValuesEnabled) {
    // Row number based on the "Count" column; compared against maxNumUniqueValues below.
    filteredValues = filteredValues
      .WithRowNumberColumn("RowNumber", ["Count"], null, [false]);
    filteredValuesColumns = Concat(filteredValuesColumns, ["RowNumber"]);
  }

  filteredValues = filteredValues
    .Select(filteredValuesColumns);

  // Generate select returning all the accepted values: a value has to pass every enabled limit.
  let allValues = filteredValues
    .(minValueUsageEnabled ? Where(#sql{Column("Usage") >= Variable("minValueUsage")}) : _)
    .(maxNumUniqueValuesEnabled ? Where(#sql{Column("RowNumber") <= Variable("maxNumUniqueValues")}) : _)
    .Select([valueColumnName, newColumnName: valueColumnName]);

  if (!IsNullTop(includeOthers)) {
    // If includeOthers is defined, every value that was NOT accepted gets includeOthers as its new value.
    // A value is rejected when it fails ANY enabled limit, so the rejected set is built as the union of
    // two disjoint parts: values below the usage limit, and values that pass the usage limit but exceed
    // the maximum number of unique values.
    let otherValues;
    if (minValueUsageEnabled) {
      otherValues = filteredValues
        .Where(#sql{Column("Usage") < Variable("minValueUsage")});
      if (maxNumUniqueValuesEnabled) {
        otherValues = otherValues.Append(
          filteredValues
            .Where(#sql{Column("Usage") >= Variable("minValueUsage")})
            .Where(#sql{Column("RowNumber") > Variable("maxNumUniqueValues")})
        );
      }
    } else {
      // Only maxNumUniqueValues is enabled here, because at least one limit is always enabled at this point.
      otherValues = filteredValues
        .Where(#sql{Column("RowNumber") > Variable("maxNumUniqueValues")});
    }
    otherValues = otherValues
      .WithColumn(newColumnName, #sql{Variable("includeOthers")})
      .Select([valueColumnName, newColumnName]);
    allValues = allValues.Append(otherValues);
  }

  // Keep only the rows whose value is listed in allValues; the join also brings in the newColumnName column.
  return df.Join(allValues, [columnName: valueColumnName], "inner")
    .RemoveColumns([valueColumnName]);
}

// The following example will return only rows containing two of the most common values for Region-column.
// let df = DataTableById(2).SqlDataFrame;
// df = ColumnWithMinUsage(df, "Region", "_Filtered", 2, null, null);
// df.Collect().ToCsv();

// The following example will return all input rows, but will replace the values of rows whose Region-column
// has a value used by less than 15% of all the rows with a new value: "_Others".
// let df = DataTableById(2).SqlDataFrame;
// df = ColumnWithMinUsage(df, "Region", "_Filtered", null, 0.15, "_Others");
// df.Collect().ToCsv();