00001
00009 #include "party.h"
00010
00011
00033 void C_split(const double *x, int p,
00034 const double *y, int q,
00035 const double *weights, int n,
00036 const int *orderx,
00037 SEXP splitctrl, SEXP linexpcov2sample,
00038 SEXP expcovinf, double *cutpoint, double *maxstat,
00039 double *statistics) {
00040
00041 double *dExp_y, *dCov_y, *dlinstat, *dexpect, *dcovar,
00042 tol, sweights, minprob, minbucket, w, tx, f1, f2, f1w, f2ww, tmp;
00043 double minobs, maxobs, xmax;
00044 int lastj, i, j, k;
00045
00046 if (p != 1) error("C_split: p not equal to one");
00047 tol = get_tol(splitctrl);
00048
00049
00050
00051
00052 xmax = 0.0;
00053 for (i = 0; i < n; i++) {
00054 statistics[i] = 0.0;
00055 if (weights[i] > 0.0 && x[i] > xmax) xmax = x[i];
00056 }
00057
00058
00059
00060 dExp_y = REAL(GET_SLOT(expcovinf, PL2_expectationSym));
00061 dCov_y = REAL(GET_SLOT(expcovinf, PL2_covarianceSym));
00062 sweights = REAL(GET_SLOT(expcovinf, PL2_sumweightsSym))[0];
00063
00064
00065 if (sweights > 1) {
00066
00067
00068
00069 minprob = get_minprob(splitctrl);
00070 minbucket = get_minbucket(splitctrl);
00071 minobs = sweights * minprob + 1.0;
00072
00073 if (minobs < minbucket)
00074 minobs = minbucket;
00075 maxobs = sweights * (1 - minprob) - 1.0;
00076 if (maxobs > sweights - minbucket)
00077 maxobs = sweights - minbucket;
00078
00079 f1 = (double) sweights / (sweights - 1);
00080 f2 = 1.0 / (sweights - 1);
00081 w = 0.0;
00082
00083
00084 dlinstat = REAL(GET_SLOT(linexpcov2sample, PL2_linearstatisticSym));
00085 for (k = 0; k < q; k++) dlinstat[k] = 0.0;
00086 dexpect = REAL(GET_SLOT(linexpcov2sample, PL2_expectationSym));
00087 dcovar = REAL(GET_SLOT(linexpcov2sample, PL2_covarianceSym));
00088
00089 tx = 0.0;
00090 lastj = 0;
00091
00092
00093 for (i = 0; i < (n - 1); i++) {
00094
00095
00096 j = orderx[i] - 1;
00097
00098
00099 if (weights[j] == 0.0) continue;
00100
00101
00102 if (w > 0 && x[j] < tx)
00103 warning("C_split: inconsistent ordering: %f < %f!\n",
00104 x[j], tx);
00105
00106
00107
00108 if (w > 0 && x[j] == tx)
00109 statistics[lastj] = 0.0;
00110
00111
00112 tx = x[j];
00113 lastj = j;
00114
00115 w += weights[j];
00116
00117
00118 if (w >= maxobs || x[j] >= xmax) break;
00119
00120
00121
00122 for (k = 0; k < q; k++)
00123 dlinstat[k] += y[n * k + j] * weights[j];
00124
00125 if (w > minobs) {
00126 for (k = 0; k < q; k++)
00127 dexpect[k] = w * dExp_y[k];
00128
00129 f1w = f1 * w;
00130 f2ww = f2 * w * w;
00131 for (k = 0; k < q*q; k++)
00132 dcovar[k] = f1w * dCov_y[k] - f2ww * dCov_y[k];
00133 } else {
00134 continue;
00135 }
00136
00137
00138
00139
00140
00141
00142 statistics[j] = 0.0;
00143 for (k = 0; k < q; k++) {
00144 if (dcovar[k * q + k] <= tol) continue;
00145 tmp = fabs(dlinstat[k] - dexpect[k]) / sqrt(dcovar[k * q + k]);
00146 if (statistics[j] < tmp) statistics[j] = tmp;
00147 }
00148
00149 }
00150
00151
00152 maxstat[0] = 0.0;
00153 for (i = 0; i < (n - 1); i++) {
00154 if (statistics[i] > maxstat[0]) {
00155 maxstat[0] = statistics[i];
00156 cutpoint[0] = x[i];
00157 }
00158 }
00159 }
00160 }
00161
00162
00175 SEXP R_split(SEXP x, SEXP y, SEXP weights, SEXP orderx, SEXP linexpcov2sample,
00176 SEXP expcovinf, SEXP splitctrl) {
00177
00178 SEXP ans, cutpoint, maxstat, statistics;
00179
00180 PROTECT(ans = allocVector(VECSXP, 3));
00181 SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
00182 SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
00183 SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
00184
00185 C_split(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
00186 INTEGER(orderx), splitctrl, linexpcov2sample, expcovinf,
00187 REAL(cutpoint), REAL(maxstat), REAL(statistics));
00188 UNPROTECT(1);
00189 return(ans);
00190 }
00191
00192
00217 void C_splitcategorical(const int *codingx, int p,
00218 const double *y, int q,
00219 const double *weights, int n,
00220 double *standstat,
00221 SEXP splitctrl, SEXP linexpcov2sample,
00222 SEXP expcovinf, double *cutpoint, int *levelset,
00223 double *maxstat, double *statistics) {
00224
00225 double tol, *tmpx, *tmptmpx, tmp = 0.0;
00226 int *irank, *ordertmpx, i, j, k, l, jp, chk;
00227
00228 tol = get_tol(splitctrl);
00229
00230
00231 tmpx = Calloc(n, double);
00232 ordertmpx = Calloc(n, int);
00233 irank = Calloc(p, int);
00234 tmptmpx = Calloc(n, double);
00235
00236
00237 for (j = 0; j < q; j++) {
00238
00239 jp = j * p;
00240
00241
00242
00243
00244 for (k = 0; k < p; k++) {
00245 irank[k] = 1;
00246 for (l = 0; l < p; l++)
00247 if (standstat[jp + l] < standstat[jp + k]) irank[k]++;
00248 }
00249
00250
00251 for (i = 0; i < n; i++) {
00252 tmpx[i] = (double) irank[codingx[i] - 1];
00253 tmptmpx[i] = tmpx[i];
00254 ordertmpx[i] = i + 1;
00255 }
00256
00257
00258 rsort_with_index(tmptmpx, ordertmpx, n);
00259
00260
00261 C_split(tmpx, 1, y, q, weights, n, ordertmpx,
00262 splitctrl, linexpcov2sample,
00263 expcovinf, cutpoint, maxstat, statistics);
00264
00265
00266
00267 chk = 0;
00268 if (maxstat[0] > tmp) {
00269 for (k = 0; k < p; k++) {
00270 if (irank[k] > cutpoint[0]) {
00271 levelset[k] = 1;
00272 chk += 1;
00273 } else {
00274 levelset[k] = 0;
00275 }
00276 }
00277 tmp = maxstat[0];
00278 }
00279
00280
00281
00282
00283
00284
00285
00286 }
00287 maxstat[0] = tmp;
00288
00289
00290 Free(tmpx); Free(ordertmpx); Free(irank); Free(tmptmpx);
00291 }
00292
00293
00307 SEXP R_splitcategorical(SEXP x, SEXP codingx, SEXP y, SEXP weights,
00308 SEXP linexpcov2sample, SEXP linexpcov,
00309 SEXP expcovinf, SEXP splitctrl) {
00310
00311 SEXP ans, cutpoint, maxstat, statistics, levelset;
00312 double *standstat;
00313
00314 C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
00315 1, GET_SLOT(linexpcov, PL2_expcovinfSym), linexpcov);
00316
00317 standstat = Calloc(get_dimension(linexpcov), double);
00318 C_standardize(REAL(GET_SLOT(linexpcov, PL2_linearstatisticSym)),
00319 REAL(GET_SLOT(linexpcov, PL2_expectationSym)),
00320 REAL(GET_SLOT(linexpcov, PL2_covarianceSym)),
00321 get_dimension(linexpcov), get_tol(splitctrl), standstat);
00322
00323 PROTECT(ans = allocVector(VECSXP, 4));
00324 SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
00325 SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
00326 SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
00327 SET_VECTOR_ELT(ans, 3, levelset = allocVector(INTSXP, ncol(x)));
00328
00329 C_splitcategorical(INTEGER(codingx), ncol(x), REAL(y), ncol(y), REAL(weights),
00330 nrow(x), standstat,
00331 splitctrl, linexpcov2sample, expcovinf,
00332 REAL(cutpoint), INTEGER(levelset), REAL(maxstat),
00333 REAL(statistics));
00334
00335 UNPROTECT(1);
00336 Free(standstat);
00337 return(ans);
00338 }