Open R in QPR ProcessAnalyzer

From QPR ProcessAnalyzer Wiki
Jump to navigation Jump to search


Install SQL Server Express Advanced

  1. Download Microsoft® SQL Server® 2016 Express from https://www.microsoft.com/en-us/download/details.aspx?id=52679
  2. Run
  3. Select "custom" installation type
  4. Change folder or just click "Install"
  5. Click "New SQL Server stand-alone installation or add features to an existing installation"
  6. Accept the license terms and click next
  7. Next
  8. Accept and next
  9. All except "localDB" are selected and click next
  10. Set instance name and id as "QPRPA_SQLEXPRESS" and next
    OpenRDBInstanceConfiguration.png
  11. Next
    OpenRDBConfiguration.png
  12. Next
  13. Next
  14. Accept and next
  15. Close
  16. Install SQL Server Management Studio click SSMS (Needed for enabling R)

R Configuration

  1. Allow ETL script to install (And remove) R-packages:
    • Permission configuration for XPRESS installation:
    C:\Program Files\Microsoft SQL Server\MSSQL13.QPRPA_SQLEXPRESS\R_SERVICES\library
    Give folder 'Write' and 'Modify' Permission to group 'SQLRUserGroupQPRPA_SQLEXPRESS'
    UserPermissions.png
    • Permission configuration for SERVER installation:
    C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\R_SERVICES\library
    Give folder 'Write' and 'Modify' Permission to group 'SQLRUserGroup'
    If permission missing then following error is possible: SQL error: A 'R' script error occurred during execution of 'sp_execute_external_script' with HRESULT 0x80004004.
  2. "Enable R" (https://msdn.microsoft.com/en-us/library/mt653951.aspx)
    • Add sandbox user db_owner to QPR_PA_SB_USER. This is only valid for server configuration (not xpress user).
    USE [QPR_PA_SANDBOX]
    GO
    ALTER ROLE [db_owner] ADD MEMBER [QPR_PA_SB_USER]
    • To enable R scripts, an administrator can run the following statement in SQL Server Management Studio:
    sp_configure 'external scripts enabled',1
    reconfigure with override
    • Restart service: SQL Server (QPRPA_SQLEXPRESS)
    OpenRServiceManager2.png
  3. Setup XPRESS sandbox connection string:
    • From XPress version, click on Start button and then open Session Information windows dialog
    • Click on "Configure Scripting..." button:
    ConfigureScripting2.PNG
    • Edit the Scripting Database and enter the specific connection string e.g.:
    Data Source=localhost\QPRPA_SQLEXPRESS;Integrated Security=true;
    • Click on OK and Close buttons
  4. Script test R-functionality
    An administrator can run the following statement in SQL Server Management Studio without error to confirm that the configuration is completed:
    exec sp_execute_external_script @language =N'R',
    @script=N'
    install.packages("vegan")
    library("vegan")
    iris.c <- scale(iris[ ,1:4])
    pca <- rda(iris.c)
    OutputDataSet<-iris',
    @input_data_1 =N'select 1 as hello'
    with result sets (([Sepal.Length] float, [Sepal.Width] float, [Petal.Length] float, [Petal.Width] float, [Species] nvarchar(100)));
    Note: if the message shows: "SQL Server was unable to communicate with the LaunchPad service. Please verify the configuration of the service. "
    http://dba.stackexchange.com/questions/120205/msg-39011-sql-server-was-unable-to-communicate-with-the-launchpad-service
    Click the Advanced tab and edit the value of External Users Count if necessary. This setting controls how many different SQL users can run queries concurrently in R. The default is 20 accounts, which in most cases is more than adequate to support R sessions.

Verify Basic R Configuration

Execute the following script to verify basic R configuration:

exec sp_execute_external_script  @language =N'R', 
@script=N'
max(1)',   
@input_data_1 =N'select 1 as hello' ;

Clustering Example Script

The script below is an example script for clustering the cases into two clusters based on case attributes. It used k-prototypes clustering algorithm from the R package "clustMixType". This algorithm is a variation of k-means clustering with the difference that it takes into account numeric and factor variables for calculating the differences between cases. After the R script is run, a new case attribute "Cluster" is imported into the model, which contains the index of the cluster each case belongs to:

(SELECT 'AnalysisType', '5') UNION ALL
(SELECT 'ModelId', CAST(@_ModelId as NVARCHAR(255))) UNION ALL
--(SELECT 'SelectedActivityCounts','*') UNION ALL
(SELECT 'SelectedCaseAttributes', '*') UNION ALL
(SELECT 'MaximumCount', '0') UNION ALL
(SELECT 'TargetTable', '#Cases') UNION ALL
(SELECT 'UseSqlVariants', 'false')
--#GetAnalysis

declare @numberOfClusters varchar(255);
set @numberOfClusters = '2';

declare @clusteringScript nvarchar(max);
set @clusteringScript = N'
install.packages("clustMixType", dependencies=TRUE, repos="http://cran.rstudio.com/")
library("clustMixType")
data <- InputDataSet[, !(names(InputDataSet) %in% c("Name", "X.Id.", "Start.Time", "End.Time", "Cluster", "Duration.Days"))]
factorColumns <- sapply(data[, sapply(data, is.factor)], as.character)
factorColumns[is.na(factorColumns)] <- "NULL"
data[, sapply(data, is.factor)] <- data.frame(factorColumns, stringsAsFactors = TRUE)
clustering <- kproto(data.frame(data), ' + @numberOfClusters + ', lambda = 50)
caseClusters <- data.frame(InputDataSet[, "Name"])
caseClusters$Cluster <- as.factor(clustering$cluster)
OutputDataSet<-caseClusters';


(SELECT 'ModelId', CAST(@_ModelId as NVARCHAR(255))) UNION ALL
(SELECT 'Append', 'True')
exec sp_execute_external_script @language =N'R',  
@script=@clusteringScript, 
@input_data_1 = N'SELECT * FROM #Cases'
with result sets ((Name nvarchar(max), Cluster int));
--#ImportCaseAttributes
  • Note: Loading Data from ODBC Datasources not support --#ImportCaseAttributes function.
  • Note 2: Script for manual library loading:
exec sp_execute_external_script  @language =N'R', 
@script=N'
install.packages("clustMixType", dependencies=TRUE, repos="http://cran.rstudio.com/")',
@input_data_1 =N'select 1 as hello' ;

Case Prediction Example Script

The script below uses gradient boosting classification algorithm from the R packages 'caret' and 'gbm' for making predictions of a case attribute with discrete values. The 'caret' package also allows using other classification algorithms and specifying parameters for predictions. The script uses supervised learning algorithm for making the predictions. The learning data set for the algorithm contains case attributes information and event type amounts which are available by taking Case Analysis with all case attributes and all event type counts included. The script supports predictors (i.e. all other case attributes than the one for which the predictions are made) with both continuous and discrete values.

Parameters

'@caseAttributeName'
The name of the actual case attribute for which predictions are made. The case attribute should have discrete values and some values should already exist, as the prediction is done for the missing values.
'@targetCaseAttributeName'
The name of the target case attribute where the predicted values are stored.
(SELECT 'AnalysisType', '5') UNION ALL
(SELECT 'FilterId', CAST(@_FilterId as NVARCHAR(255))) UNION ALL
(SELECT 'SelectedCaseAttributes', '*') UNION ALL
(SELECT 'SelectedActivityCounts', '*') UNION ALL
(SELECT 'MaximumCount', '0') UNION ALL
(SELECT 'TargetTable', '#Cases') UNION ALL
(SELECT 'UseSqlVariants', '0')
--#GetAnalysis

declare @caseAttributeName nvarchar(255);
set @caseAttributeName = 'Happy Customer';
set @caseAttributeName = replace(@caseAttributeName, ' ', '.');

declare @targetCaseAttributeName nvarchar(255);
set @targetCaseAttributeName = 'Predicted Happy Customer';

declare @script nvarchar(max);
set @script = N'
install.packages(''caret'', dependencies = TRUE)
install.packages(''gbm'', dependencies = TRUE)
library(caret)

cases <- InputDataSet[, !(names(InputDataSet) %in% c("Name", "X.Id.", "Start.Time", "End.Time", "Event.Cost", "' + replace(@targetCaseAttributeName, ' ', '.') +  '"))]
output <- data.frame(InputDataSet[, "Name"])
caseAttrIndex <- which(colnames(cases)=="' + @caseAttributeName + '")
if (length(caseAttrIndex) == 0)
{
  print("Case attribute ''' + @caseAttributeName + ''' does not exist")
  OutputDataSet<-data.frame("Error", "Error")
} else {

  naInd <- as.numeric(rownames(cases[is.na(cases[,caseAttrIndex]) | cases[,caseAttrIndex] == "NULL",]))
  notNaInd <- as.numeric(rownames(cases[!is.na(cases[,caseAttrIndex]) & cases[,caseAttrIndex] != "NULL",]))

  if (length(notNaInd) > 0)
  {
    #classifier <- train(cases[notNaInd, -caseAttrIndex], factor(cases[notNaInd, caseAttrIndex]), method = "nb", trControl = trainControl(method = "none"), tuneGrid = data.frame(usekernel=FALSE,fL=0))
    classifier <- train(cases[notNaInd, -caseAttrIndex], factor(cases[notNaInd, caseAttrIndex]), method = "gbm", 
                         trControl = trainControl(method="boot", number=1, savePredictions="final",
                                                  index=createResample(factor(cases[notNaInd, caseAttrIndex]), 1)))
    #classifier <- train(cases[notNaInd, -caseAttrIndex], factor(cases[notNaInd, caseAttrIndex]), method = "gbm")


    predictedTrain <- predict(classifier, cases[notNaInd, -caseAttrIndex])
    errorsTrain <- table(predictedTrain, factor(cases[notNaInd, caseAttrIndex]))
    confusionMatrixTrain <- confusionMatrix(predictedTrain, factor(cases[notNaInd, caseAttrIndex]))

    output[notNaInd, 2] <- as.factor(predictedTrain)

    if (length(naInd) > 0)
    {
      predicted <- predict(classifier, cases[naInd, -caseAttrIndex])
      output[naInd, 2] <- as.factor(predicted)
    }
    print(confusionMatrixTrain$overall["Accuracy"])
  } else {
    output[, 2] <- NA
  }
  OutputDataSet<-output
}
';

DECLARE @ParmDefinition nvarchar(500);  
SET @ParmDefinition = N'@classificationScript nvarchar(max), @query nvarchar(255)'; 

declare @casesQuery nvarchar(255);
set @casesQuery = N'SELECT * FROM #Cases';

declare @executeRSql nvarchar(4000);
set @executeRSql = '
exec sp_execute_external_script @language =N''R'',  
@script=@classificationScript, 
@input_data_1 = @query
with result sets ((Name nvarchar(max), [' + @targetCaseAttributeName + '] nvarchar(max)));'

(SELECT 'ModelId', CAST(@_ModelId as NVARCHAR(255))) UNION ALL
(SELECT 'Append', 'True')
EXECUTE sp_executesql @executeRSql, @ParmDefinition, @classificationScript = @script, @query = @casesQuery;
--#ImportCaseAttributes
  • Note: Loading Data from ODBC Datasources not support --#ImportCaseAttributes function.