Main Page | Directories | File List | File Members | Related Pages

Splits.c

Go to the documentation of this file.
00001 
00009 #include "party.h"
00010 
00011 
00033 void C_split(const double *x, int p,
00034              const double *y, int q,
00035              const double *weights, int n,
00036              const int *orderx,
00037              SEXP splitctrl, SEXP linexpcov2sample, 
00038              SEXP expcovinf, double *cutpoint, double *maxstat, 
00039              double *statistics) {
00040 
00041     double *dExp_y, *dCov_y, *dlinstat, *dexpect, *dcovar, 
00042            tol, sweights, minprob, minbucket, w, tx, f1, f2, f1w, f2ww, tmp;
00043     double minobs, maxobs, xmax;
00044     int lastj, i, j, k;
00045 
00046     if (p != 1) error("C_split: p not equal to one");
00047     tol = get_tol(splitctrl);
00048 
00049     /* init statistics and determine the maximal value with positive weight 
00050        since we can't choose this one as cutpoint
00051     */
00052     xmax = 0.0;
00053     for (i = 0; i < n; i++) {
00054         statistics[i] = 0.0;
00055         if (weights[i] > 0.0 && x[i] > xmax) xmax = x[i];
00056     }
00057 
00058     /* we already have expecation and covariance of the response
00059      * values and the sum of the weights */
00060     dExp_y = REAL(GET_SLOT(expcovinf, PL2_expectationSym));
00061     dCov_y = REAL(GET_SLOT(expcovinf, PL2_covarianceSym));
00062     sweights = REAL(GET_SLOT(expcovinf, PL2_sumweightsSym))[0];
00063 
00064     /* if there is something to split */
00065     if (sweights > 1) {
00066 
00067         /* we need to ensure that at least minbucket weights 
00068            are there to split (either left or right) */
00069         minprob = get_minprob(splitctrl);
00070         minbucket = get_minbucket(splitctrl);
00071         minobs = sweights * minprob + 1.0;
00072 
00073         if (minobs < minbucket) 
00074             minobs = minbucket; 
00075         maxobs = sweights * (1 - minprob) - 1.0;
00076         if (maxobs > sweights - minbucket) 
00077             maxobs = sweights - minbucket; 
00078 
00079         f1 = (double) sweights / (sweights - 1);
00080         f2 = 1.0 / (sweights - 1);
00081         w = 0.0;
00082     
00083         /* pointers to the R-objects */
00084         dlinstat = REAL(GET_SLOT(linexpcov2sample, PL2_linearstatisticSym));
00085         for (k = 0; k < q; k++) dlinstat[k] = 0.0;
00086         dexpect = REAL(GET_SLOT(linexpcov2sample, PL2_expectationSym));
00087         dcovar = REAL(GET_SLOT(linexpcov2sample, PL2_covarianceSym));
00088 
00089         tx = 0.0;
00090         lastj = 0;
00091 
00092         /* for all possible cutpoints (defined by the observations x) */
00093         for (i = 0; i < (n - 1); i++) {
00094     
00095             /* the ordering of the ith observation */
00096             j = orderx[i] - 1;
00097         
00098             /* if the corresponding weight is zero */
00099             if (weights[j] == 0.0) continue;
00100 
00101             /* just a check: can be removed later */
00102             if (w > 0 && x[j] < tx)
00103                 warning("C_split: inconsistent ordering: %f < %f!\n", 
00104                         x[j], tx);
00105         
00106             /* handle ties: delete the entry of the last visited observation
00107                (take care of zero weights!) */
00108             if (w > 0 && x[j] == tx)
00109                 statistics[lastj] = 0.0; 
00110 
00111             /* store the value and position of the j smallest observation */
00112             tx = x[j];
00113             lastj = j;
00114         
00115             w += weights[j];
00116 
00117             /* do not consider those splits */
00118             if (w >= maxobs || x[j] >= xmax) break;
00119 
00120             /* compute the linear statistic and expectation and 
00121              * covariance if needed */
00122             for (k = 0; k < q; k++)
00123                 dlinstat[k] += y[n * k + j] * weights[j];
00124  
00125             if (w > minobs) {
00126                 for (k = 0; k < q; k++)
00127                     dexpect[k] = w * dExp_y[k];
00128 
00129                 f1w = f1 * w;
00130                 f2ww = f2 * w * w;
00131                 for (k = 0; k < q*q; k++)
00132                     dcovar[k] = f1w * dCov_y[k] - f2ww * dCov_y[k];
00133             } else {
00134                 continue;
00135             }
00136         
00137             /* the absolute standardized test statistic, to be maximized */
00138             /* statistics[j] = C_maxabsTestStatistic(dlinstat, 
00139                    dexpect, dcovar, q, tol); */
00140 
00141             /* much faster but uses maxabs always*/
00142             statistics[j] = 0.0;
00143             for (k = 0; k < q; k++) {
00144                 if (dcovar[k * q + k] <= tol) continue;
00145                 tmp = fabs(dlinstat[k] - dexpect[k]) / sqrt(dcovar[k * q + k]);
00146                 if (statistics[j] < tmp) statistics[j] = tmp;
00147             }
00148 
00149         }
00150     
00151         /* search for the maximum and the best separating cutpoint */
00152         maxstat[0] = 0.0;        
00153         for (i = 0; i < (n - 1); i++) {
00154             if (statistics[i] > maxstat[0]) {
00155                 maxstat[0] = statistics[i];
00156                 cutpoint[0] = x[i];
00157             }
00158         }
00159     }
00160 }
00161 
00162 
00175 SEXP R_split(SEXP x, SEXP y, SEXP weights, SEXP orderx, SEXP linexpcov2sample, 
00176              SEXP expcovinf, SEXP splitctrl) {
00177              
00178     SEXP ans, cutpoint, maxstat, statistics;
00179     
00180     PROTECT(ans = allocVector(VECSXP, 3));
00181     SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
00182     SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
00183     SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
00184     
00185     C_split(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
00186             INTEGER(orderx), splitctrl, linexpcov2sample, expcovinf,
00187             REAL(cutpoint), REAL(maxstat), REAL(statistics));
00188     UNPROTECT(1);
00189     return(ans);
00190 }
00191 
00192 
00217 void C_splitcategorical(const int *codingx, int p,
00218                         const double *y, int q,
00219                         const double *weights, int n,
00220                         double *standstat,
00221                         SEXP splitctrl, SEXP linexpcov2sample, 
00222                         SEXP expcovinf, double *cutpoint, int *levelset, 
00223                         double *maxstat, double *statistics) {
00224 
00225     double tol, *tmpx, *tmptmpx, tmp = 0.0;
00226     int *irank, *ordertmpx, i, j, k, l, jp, chk;
00227 
00228     tol = get_tol(splitctrl);          
00229                      
00230     /* allocate memory */
00231     tmpx = Calloc(n, double);
00232     ordertmpx = Calloc(n, int);
00233     irank = Calloc(p, int);
00234     tmptmpx = Calloc(n, double);
00235 
00236     /* for all response variables (aka: dummy variables) */
00237     for (j = 0; j < q; j++) {
00238     
00239         jp = j * p;
00240 
00241         /* determine the ranking of the kth level among 
00242            the standardized statistic: This induced an ordering of the 
00243            observations */
00244         for (k = 0; k < p; k++) {
00245             irank[k] = 1;
00246             for (l = 0; l < p; l++)
00247                 if (standstat[jp + l] < standstat[jp + k]) irank[k]++;
00248         }
00249         
00250         /* a temporary response variable: the rank of the level */
00251         for (i = 0; i < n; i++) {
00252             tmpx[i] = (double) irank[codingx[i] - 1];
00253             tmptmpx[i] = tmpx[i];
00254             ordertmpx[i] = i + 1;
00255         }
00256         
00257         /* order(dtmpx) */
00258         rsort_with_index(tmptmpx, ordertmpx, n);
00259 
00260         /* search for a cutpoint (now we do have an ordering) */
00261         C_split(tmpx, 1, y, q, weights, n, ordertmpx,
00262                 splitctrl, linexpcov2sample,
00263                 expcovinf, cutpoint, maxstat, statistics);
00264 
00265         /* if we have seen an improvement: save this segmentation 
00266            note: there may be splits with equal goodness */
00267         chk = 0;
00268         if (maxstat[0] > tmp) {
00269             for (k = 0; k < p; k++) {
00270                 if (irank[k] > cutpoint[0]) {
00271                     levelset[k] = 1;
00272                     chk += 1;
00273                 } else {
00274                     levelset[k] = 0;
00275                 }
00276             }
00277             tmp = maxstat[0];
00278         }
00279         /* <FIXME> make sure that at least one level goes left,
00280                    C_split may end up with cutpoint > max(irank), why?
00281            </FIXME>
00282         */
00283         /* hm, why did I added 
00284         if (chk == 0) tmp = 0.0; 
00285         ??? */
00286     }
00287     maxstat[0] = tmp;
00288 
00289     /* free memory */
00290     Free(tmpx); Free(ordertmpx); Free(irank); Free(tmptmpx);
00291 }
00292 
00293 
00307 SEXP R_splitcategorical(SEXP x, SEXP codingx, SEXP y, SEXP weights, 
00308                         SEXP linexpcov2sample, SEXP linexpcov, 
00309                         SEXP expcovinf, SEXP splitctrl) {
00310              
00311     SEXP ans, cutpoint, maxstat, statistics, levelset;
00312     double *standstat;
00313 
00314     C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
00315                     1, GET_SLOT(linexpcov, PL2_expcovinfSym), linexpcov);
00316 
00317     standstat = Calloc(get_dimension(linexpcov), double);
00318     C_standardize(REAL(GET_SLOT(linexpcov, PL2_linearstatisticSym)),
00319                   REAL(GET_SLOT(linexpcov, PL2_expectationSym)),
00320                   REAL(GET_SLOT(linexpcov, PL2_covarianceSym)),
00321                   get_dimension(linexpcov), get_tol(splitctrl), standstat);
00322 
00323     PROTECT(ans = allocVector(VECSXP, 4));
00324     SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
00325     SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
00326     SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
00327     SET_VECTOR_ELT(ans, 3, levelset = allocVector(INTSXP, ncol(x)));
00328     
00329     C_splitcategorical(INTEGER(codingx), ncol(x), REAL(y), ncol(y), REAL(weights), 
00330                        nrow(x), standstat, 
00331                        splitctrl, linexpcov2sample, expcovinf, 
00332                        REAL(cutpoint), INTEGER(levelset), REAL(maxstat), 
00333                        REAL(statistics));
00334 
00335     UNPROTECT(1);
00336     Free(standstat);
00337     return(ans);
00338 }

Generated on Fri Aug 25 14:30:01 2006 for party by  doxygen 1.4.4