CWB
Defines | Functions | Variables

ranges.c File Reference

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <limits.h>
#include <string.h>
#include <ctype.h>
#include <unistd.h>
#include "../cl/globals.h"
#include "../cl/corpus.h"
#include "../cl/attributes.h"
#include "../cl/cdaccess.h"
#include "../cl/macros.h"
#include "corpmanag.h"
#include "eval.h"
#include "ranges.h"
#include "output.h"
#include "matchlist.h"
#include "options.h"

Defines

Functions

Variables


Define Documentation

#define SORT_DEBUG   0

Referenced by i2compare(), and SortExternally().

#define spaceship (   A,
 
)    ((A) > (B)) ? 1 : ((A) < (B)) ? -1 : 0

Referenced by random_compare().

#define USE_SORT_CACHE

Defined if a sort cache is to be used in sorting concordance lines.

The sort cache (caching the lexicon IDs of the first two tokens to be compared in each line) is indispensable for the internal sorting algorithm since random accesses to a compressed corpus are painfully slow; when sorting variable length matches such as German NPs on word forms, the current implementation has a hit rate of around 99%.

Referenced by SortSubcorpus().


Function Documentation

int _RS_compare_ranges ( const void *  pa,
const void *  pb 
)

qsort() helper function for RangeSort() below

References _Range::end, and _Range::start.

Referenced by RangeSort().

int calculate_leftboundary ( CorpusList cl,
int  cpos,
Context  spc 
)

References calculate_ranges(), left, and right.

Referenced by expand_dataspace(), and findcorpus().

int calculate_ranges ( CorpusList cl,
int  cpos,
Context  spc,
int *  left,
int *  right 
)
int calculate_rightboundary ( CorpusList cl,
int  cpos,
Context  spc 
)

References calculate_ranges(), left, and right.

Referenced by expand_dataspace(), findcorpus(), and simulate().

int copy_intervals ( CorpusList cp,
Bitfield  intervals,
int  mode,
char *  subcorpname 
)

Copy concordance hits from a query-generated subcorpus to a (new or existing) subcorpus.

This function is not currently in use.

Parameters:
cpThe CorpusList indicating the query to copy from.
intervalsA Bitfield containing a bit for each query hit, which is true if the hit is "selected", false if not.
modeALL_LINES, SELECTED_LINES or UNSELECTED_LINES (indicating which lines to copy).
subcorpnameName for the subcorpus to which the lines are to be copied.
Returns:
Boolean: true for success, false for failure.

References auto_save, cqpmessage(), delete_intervals(), dropcorpus(), duplicate_corpus(), BFBuf::elements, Error, False, findcorpus(), cl::mother_name, cl::name, RangeSetop(), RUnion, save_subcorpus(), cl::saved, SELECTED_LINES, cl::size, SUB, SYSTEM, toggle_bit(), cl::type, UNDEF, and UNSELECTED_LINES.

int delete_interval ( CorpusList cp,
int  nr 
)

Delete a single concordance hit from a query-generated subcorpus.

This function is not currently in use.

Parameters:
cpThe CorpusList indicating the query to delete from.
nrThe index of the interval to delete (by setting its start and end values to -1).
Returns:
Boolean: true for success, false for failure.

References cl_free, _Range::end, cl::range, RangeSetop(), RReduce, cl::size, cl::sortidx, _Range::start, SUB, and cl::type.

int delete_intervals ( CorpusList cp,
Bitfield  intervals,
int  mode 
)

Delete a whole bunch of concordance hits from a query-generated subcorpus.

Parameters:
cpThe CorpusList indicating the query to delete from.
intervalsA Bitfield containing a bit for each query hit, which is true if the hit is "selected", false if not.
modeALL_LINES, SELECTED_LINES or UNSELECTED_LINES (indicating which lines to delete).
Returns:
Boolean: true for success, false for failure.

References ALL_LINES, auto_save, cl_free, BFBuf::elements, _Range::end, get_bit(), cl::keywords, cl::range, RangeSetop(), RReduce, save_subcorpus(), SELECTED_LINES, cl::size, cl::sortidx, _Range::start, SUB, cl::targets, TEMP, touch_corpus(), cl::type, and UNSELECTED_LINES.

Referenced by copy_intervals(), do_delete_lines(), do_delete_lines_num(), do_reduce(), and do_StandardQuery().

void FreeSortClause ( SortClause  sc)

Frees a SortClause object.

References _sort_clause::attribute_name, and cl_free.

static int group2compare ( const void *  vidx1,
const void *  vidx2 
) [static]

Compares two groups of equivalent matches by group sizes (descending), breaking ties through i2compare.

References current_sortidx, EvaluationIsRunning, group_first, group_size, i2compare(), s1, and s2.

Referenced by SortSubcorpus().

static int i2compare ( const void *  vidx1,
const void *  vidx2 
) [static]

Compare two matches according to current sort settings in static variables (qsort callback used in query result sorting).

This is the primary query-hit-comparison function. It wraps cl_string_qsort_compare for string comparison, but does much more as well, because we are not just comparing individual strings but rather, potentially, whole bundles of strings from various different positions.

Parameters:
vidx1Pointer to the integer index of the first of the intervals to be compared (ie an index into an array of start/end positions).
vidx2Pointer to the integer index of the second of the intervals to be compared.
Returns:
Usual returns for qsort callbacks.

References break_ties, TCorpus::charset, cl_cpos2id(), cl_id2str(), cl_string_qsort_compare(), cl::corpus, EvaluationIsRunning, MIN, s1, s2, SORT_DEBUG, sort_id_cache, srt_ascending, srt_end, srt_flags, srt_reverse, and srt_start.

Referenced by group2compare(), and SortSubcorpus().

static int random_compare ( const void *  vidx1,
const void *  vidx2 
) [static]

Sorts hits in random order by comparing random numbers in the vector random_sort_keys[], breaking ties by start and end positions of matches (from *srt_cl) in order to ensure stable sorting.

This is another qsort callback function.

References _Range::end, random_sort_keys, cl::range, spaceship, and _Range::start.

Referenced by SortSubcorpusRandomize().

int RangeSetop ( CorpusList corpus1,
RangeSetOp  operation,
CorpusList corpus2,
Bitfield  restrictor 
)

Carries out one of a set of operations on corpus1.

The operations that can be carried out are as follows:

RUnion - copy intervals from corpus2 to corpus1 (no duplicates); RIntersection - remove from corpus1 any intervals that are not also in corpus2; RDiff RMaximalMatches - remove spurious matches according to "longest" strategy; RMinimalMatches - remove spurious matches according to "shortest" strategy; RLeftMaximalMatches - remove spurious matches according to "standard" strategy; RNonOverlapping RUniq - remove duplicate intervals from corpus1; RReduce - remove intervals marked for deletion (by having the start memebr set to -1).

TODO to avopid confusion with the object, a better name for this function would be do_RangeSetOp

Parameters:
corpus1The corpus to be changed.
operationSpecifies which operation is to be carried out.
corpus2The corpus that is the second argument for this operation. Can be NULL if no corpus2 is required for operation.
restrictorSpecifies which intervals in corpus2 are to be taken notice of versus ignored. Can be NULL.
Returns:
Boolean, true for all OK, otherwise false.

References cl_free, cl_malloc(), cl_realloc(), _Range::end, get_bit(), cl::keywords, cl::range, RangeSetop(), RDiff, RIntersection, RLeftMaximalMatches, RMaximalMatches, RMinimalMatches, RNonOverlapping, RReduce, rs_cp_range(), RUnion, RUniq, cl::size, cl::sortidx, _Range::start, cl::targets, and touch_corpus().

Referenced by copy_intervals(), delete_interval(), delete_intervals(), do_cut(), do_setop(), do_StandardQuery(), evaluate_subset(), expand_dataspace(), findcorpus(), prepare_Query(), RangeSetop(), and set_corpus_matchlists().

void RangeSort ( CorpusList c,
int  mk_sortidx 
)

Make sure that ranges are sorted in 'natural' order (i.e.

by start and end cpos).

This function has to be called when matching ranges are modified and may be needed when loading a query result (with "undump") that is not sorted in ascending order; with optional "mk_sortidx" flag, a sortidx corresponding to the original ordering is created.

Parameters:
cThe corpus (ie subcorpus/query) whose intervals ('ranges') are to be sorted.
mk_sortidxBoolean flag: if true a sortidx is created.

References _RS_compare_ranges(), cl_free, cl_malloc(), cqpmessage(), Error, cl::keywords, cl::name, cl::range, cl::size, cl::sortidx, SUB, cl::targets, TEMP, cl::type, and Warning.

Referenced by do_undump(), evaluate_target(), and set_target().

void rs_cp_range ( Range rng,
int *  target,
int *  keyword,
int  ins,
CorpusList corpus,
int  j 
)

this is a rather specialised utility function for the UNION part of RangeSetop() (copies range + keyword/target (if defined) in corpus into temporary lists)

References _Range::end, cl::keywords, cl::range, _Range::start, and cl::targets.

Referenced by RangeSetop().

int SortExternally ( void  )
int SortSubcorpus ( CorpusList cl,
SortClause  sc,
int  count_mode,
struct Redir redir 
)

Sort the (query) subcorpus specified by cl, or count frequencies of matching strings.

(Note that frequency counting and query result sorting are done via the same sorting algorithm.)

If the sort was not performed successfully, the sort index is reset to the default sort order, and the function returns false.

Parameters:
clSubcorpus designating the query to sort.
scA sort clause. sc = NULL resets the sort index to the default sort order (i.e. sorted by corpus position).
count_modeBoolean: run the function in count frequency mode?
redirRedir object for where the output of string-counting is to be displayed.
Returns:
Boolean: true for successful sort, false for unsuccessful.

References access_corpus(), _sort_clause::anchor1, _sort_clause::anchor2, ATT_POS, _sort_clause::attribute_name, break_ties, TCorpus::charset, cl_cpos2id(), cl_cpos2str(), cl_free, cl_malloc(), cl_max_cpos(), cl_strdup(), cl_string_canonical(), cl_string_reverse(), close_stream(), cl::corpus, cqp, cqpmessage(), current_sortidx, DEFAULT_ATT_NAME, _Range::end, Error, EvaluationIsRunning, find_attribute, _sort_clause::flags, group2compare(), group_first, group_size, i2compare(), Info, insecure, install_signal_handler(), KeywordField, cl::keywords, MatchEndField, MatchField, cl::name, NoField, _sort_clause::offset1, _sort_clause::offset2, open_stream(), pretty_print, cl::range, cl::size, _sort_clause::sort_ascending, sort_id_cache, _sort_clause::sort_reverse, SortExternally(), cl::sortidx, srt_anchor1, srt_anchor2, srt_ascending, srt_end, srt_flags, srt_offset1, srt_offset2, srt_reverse, srt_start, _Range::start, Redir::stream, TargetField, cl::targets, text_size, touch_corpus(), USE_SORT_CACHE, UseExternalSorting, Warning, and which_app.

int SortSubcorpusRandomize ( CorpusList cl,
int  seed 
)

Sorts a query result in random order.

If seed > 0, a reproducible and stable ordering is generated based on the start and end corpus positions of matches (i.e. two given matches will always be sorted in the same order).

Parameters:
clCorpus-list object representing the query to sort.
seedSeed for the randomiser; should ideally be a prime number (2^31 is a particularly bad choice); if it is 0, then the internal RNG's standard random order is used.

References access_corpus(), cl_free, cl_malloc(), cl_random(), cl_set_rng_state(), cqp, cqpmessage(), _Range::end, Error, EvaluationIsRunning, Info, install_signal_handler(), cl::name, random_compare(), random_sort_keys, cl::range, cl::size, cl::sortidx, _Range::start, touch_corpus(), Warning, and which_app.

static int srt_strcmp ( unsigned char *  s1,
unsigned char *  s2,
unsigned char *  maptable,
int  reverse 
) [static]

String comparison function used in query result sorting.

This is NOT unicode compatible - it uses a maptable, which is baaaaad, mmkay.

This function can be deleted once it has been checked that all calls to it have been replaced with cl_string_qsort_compare

References MIN, and reverse.


Variable Documentation

Range* _RS_range = NULL

Variable used by _RS_compare_ranges; global so data can be passed in without going through that function's parameter list!

int break_ties [static]

whether to break ties (by comparison without cd flags, and by line number in the last instance)

Referenced by i2compare(), and SortSubcorpus().

int* current_sortidx [static]

alias to newly created sortidx, so it can be accessed by the callback function

Referenced by group2compare(), and SortSubcorpus().

int* group_first [static]

first match for each group of identical (or equivalent) sort strings

Referenced by group2compare(), and SortSubcorpus().

int* group_size [static]

number of matches for each group of identical (or equivalent) sort strings

Referenced by group2compare(), and SortSubcorpus().

unsigned int* random_sort_keys [static]

random keys for randomized sort order (ties are broken by cpos of matches)

Referenced by random_compare(), and SortSubcorpusRandomize().

int* sort_id_cache = NULL [static]

Pointer to the sort cache.

See also:
USE_SORT_CACHE

Referenced by i2compare(), and SortSubcorpus().

In a query sort, indicates the field type of the start of sort region.

Referenced by SortExternally(), and SortSubcorpus().

In a query sort, indicates the field type of the end of sort region.

Referenced by SortExternally(), and SortSubcorpus().

int srt_ascending [static]

boolean: sort query into ascending order or not

Referenced by i2compare(), SortExternally(), and SortSubcorpus().

The )p-)Attribute on which a query is to be sorted.

CorpusList* srt_cl [static]

The CorpusList object representing a query to be sorted.

int* srt_end [static]

When sorting a query, this contains end positions of intervals to be sorted.

Referenced by i2compare(), and SortSubcorpus().

int srt_flags [static]

Whether to use the c and/or d flags when sorting a query.

Referenced by i2compare(), SortExternally(), and SortSubcorpus().

int srt_offset1 [static]

In a query sort, indicates the offset of the start of sort region.

Referenced by SortExternally(), and SortSubcorpus().

int srt_offset2 [static]

In a query sort, indicates the offset of the end of sort region.

Referenced by SortExternally(), and SortSubcorpus().

int srt_reverse [static]

boolean: sort query on reversed-character-sequence strings (and reversed sequences OF strings) or not

Referenced by i2compare(), SortExternally(), and SortSubcorpus().

int* srt_start [static]

When sorting a query, this contains start positions of intervals to be sorted.

Referenced by i2compare(), and SortSubcorpus().

int text_size [static]

When sorting a query - this represents the size of the corpus the query belongs to.

Referenced by compose_kwic_line(), SortExternally(), and SortSubcorpus().