Attached you will find a short example of the consistency tests we ran on the perturbation. I produced several results using the data set `dat <- dat[,c("sex","age","income"),]`.
I ran the perturbation four times. On the one hand, I distinguished between a complete data set `dat1 <- dat` and a reduced data set `dat2 <- dat[income>0,,]`. On the other hand, I compared (a) the built-in key generation `ck_create_input(..., def_rkey=9, ...)` with (b) pre-drawn keys `ck_create_input(..., def_rkey="rkeys_predrawn",...)`.
Is it possible to solve these issues? Regarding the consistency issue for continuous variables - which may also be of interest to @ppdewolf - Sarah solved it within a Destatis-internal prototype. She multiplied the original record key by a dichotomous indicator variable: 0 = no contribution of the continuous variable (e.g. income=0), so that the cell key is identical to the one obtained when users exclude such units from the data set, and 1 = otherwise. Might that be a solution for both of you?
# Cell Key
library(cellKey)
## The data set (sex, age, income)
# Start from the package's test data and keep only the three columns used below.
dat <- ck_create_testdata()
dat <- dat[, c("sex", "age", "income")]

# Hierarchy definitions for the two dimension variables: "@" is paired with the
# "Total" code and "@@" with each category code.
# data.table() is called with an explicit namespace so this script does not
# depend on data.table having been attached as a side effect of library(cellKey).
dim.sex <- data.table::data.table(
  levels = c("@", "@@", "@@"),
  codes = c("Total", "male", "female")
)
dim.age <- data.table::data.table(
  levels = c("@", rep("@@", 6)),
  codes = c("Total", paste0("age_group", 1:6))
)
dimList <- list(sex = dim.sex, age = dim.age)

# No weight variable is used; income is the only numeric variable passed on.
weightVar <- NULL
numVars <- c("income")
# Perturbation parameters
# The three tables are built first and then bundled into one parameter object.
pTable_destatis <- ck_create_pTable(type = "destatis")
sTable <- ck_generate_sTable(smallC = 12)
mTable <- c(0.6, 0.4, 0.2)

pert_params_destatis <- ck_create_pert_params(
  smallN = 12,
  pTable = pTable_destatis,
  sTable = sTable,
  mTable = mTable
)
# Externally pre-drawn record keys: one uniform draw per row, rounded to
# 9 decimal places. The seed is fixed so the keys are reproducible.
set.seed(123)
dat$rkeys_predrawn <- round(runif(nrow(dat)), digits = 9)

## Two data sets: 'dat1' holds all individuals, 'dat2' only those with a
## positive income, i.e. one unit is excluded in dat2
## (sex=1, age=3 and income=0).
dat[income == 0]  # show the record(s) with zero income
dat1 <- dat
dat2 <- dat[income > 0]
## (a) Both data sets using the implemented (built-in) key generation
# def_rkey is given as a number here, in contrast to variant (b), which names
# a column of pre-drawn keys.
inp_destatis_1a <- ck_create_input(
  dat = dat1,
  def_rkey = 9,
  pert_params = pert_params_destatis
)
inp_destatis_2a <- ck_create_input(
  dat = dat2,
  def_rkey = 9,
  pert_params = pert_params_destatis
)

# Perturb the sex x age table for the complete and the reduced data set.
res_destatis_1a <- perturbTable(
  inp = inp_destatis_1a,
  dimList = dimList,
  weightVar = weightVar,
  numVars = numVars
)
res_destatis_2a <- perturbTable(
  inp = inp_destatis_2a,
  dimList = dimList,
  weightVar = weightVar,
  numVars = numVars
)
## (b) Both data sets using externally generated keys
# def_rkey names the column that holds the pre-drawn record keys.
inp_destatis_1b <- ck_create_input(
  dat = dat1,
  def_rkey = "rkeys_predrawn",
  pert_params = pert_params_destatis
)
inp_destatis_2b <- ck_create_input(
  dat = dat2,
  def_rkey = "rkeys_predrawn",
  pert_params = pert_params_destatis
)

# Perturb the sex x age table for the complete and the reduced data set.
res_destatis_1b <- perturbTable(
  inp = inp_destatis_1b,
  dimList = dimList,
  weightVar = weightVar,
  numVars = numVars
)
res_destatis_2b <- perturbTable(
  inp = inp_destatis_2b,
  dimList = dimList,
  weightVar = weightVar,
  numVars = numVars
)
## Results (a)
# Add the cell keys as a column to each result table (by reference), then
# show the tables for the complete and the reduced data set.
res_destatis_1a@tab[, cellKey := res_destatis_1a@cellKeys]
res_destatis_2a@tab[, cellKey := res_destatis_2a@cellKeys]
res_destatis_1a@tab
res_destatis_2a@tab

## Results (b)
# Same steps for the runs based on the pre-drawn record keys.
res_destatis_1b@tab[, cellKey := res_destatis_1b@cellKeys]
res_destatis_2b@tab[, cellKey := res_destatis_2b@cellKeys]
res_destatis_1b@tab
res_destatis_2b@tab