CWB
|
#include "eval.h"
#include "corpmanag.h"
#include "output.h"
#include "../cl/bitfields.h"
#include "../cl/attributes.h"
RangeSetOp object: indicates a specific type of operation that can be applied when operating on sets of corpus positions making up a subcorpus.
More...
#define ALL_LINES 1 |
delete all lines
Referenced by delete_intervals().
#define SELECTED_LINES 2 |
delete the selected lines
Referenced by copy_intervals(), delete_intervals(), and do_delete_lines_num().
#define SORT_FROM_END 1 |
#define SORT_FROM_START 0 |
#define SORT_RESET 2 |
#define UNSELECTED_LINES 3 |
delete all but the selected lines
Referenced by copy_intervals(), delete_intervals(), do_reduce(), and do_StandardQuery().
typedef enum rng_setops RangeSetOp |
RangeSetOp object: indicates a specific type of operation that can be applied when operating on sets of corpus positions making up a subcorpus.
RUnion, RDiff, and RIntersection operate on two corpora; the others operate on only one.
typedef struct _sort_clause * SortClause |
typedef struct _sort_clause SortClauseBuffer |
The SortClause object (and underlying SortClauseBuffer).
Contains information about a sort to be applied to a query.
enum rng_setops |
RangeSetOp object: indicates a specific type of operation that can be applied when operating on sets of corpus positions making up a subcorpus.
RUnion, RDiff, and RIntersection operate on two corpora; the others operate on only one.
int calculate_leftboundary(CorpusList *cl, int cpos, Context spc)
References calculate_ranges(), left, and right.
Referenced by expand_dataspace(), and findcorpus().
int calculate_ranges(CorpusList *cl, int cpos, Context spc, int *left, int *right)
References ctxtsp::attrib, False, get_bounds_of_nth_struc, get_nr_of_strucs(), get_num_of_struc, get_struc_attribute, MAX, MIN, cl::mother_size, ctxtsp::size, structure, ctxtsp::type, and word.
Referenced by calculate_leftboundary(), calculate_rightboundary(), and evaluate_target().
int calculate_rightboundary(CorpusList *cl, int cpos, Context spc)
References calculate_ranges(), left, and right.
Referenced by expand_dataspace(), findcorpus(), and simulate().
int copy_intervals(CorpusList *cp, Bitfield intervals, int mode, char *subcorpname)
Copy concordance hits from a query-generated subcorpus to a (new or existing) subcorpus.
This function is not currently in use.
cp | The CorpusList indicating the query to copy from. |
intervals | A Bitfield containing a bit for each query hit, which is true if the hit is "selected", false if not. |
mode | ALL_LINES, SELECTED_LINES or UNSELECTED_LINES (indicating which lines to copy). |
subcorpname | Name for the subcorpus to which the lines are to be copied. |
References auto_save, cqpmessage(), delete_intervals(), dropcorpus(), duplicate_corpus(), BFBuf::elements, Error, False, findcorpus(), cl::mother_name, cl::name, RangeSetop(), RUnion, save_subcorpus(), cl::saved, SELECTED_LINES, cl::size, SUB, SYSTEM, toggle_bit(), cl::type, UNDEF, and UNSELECTED_LINES.
int delete_interval(CorpusList *cp, int nr)
Delete a single concordance hit from a query-generated subcorpus.
This function is not currently in use.
cp | The CorpusList indicating the query to delete from. |
nr | The index of the interval to delete (by setting its start and end values to -1). |
References cl_free, _Range::end, cl::range, RangeSetop(), RReduce, cl::size, cl::sortidx, _Range::start, SUB, and cl::type.
int delete_intervals(CorpusList *cp, Bitfield intervals, int mode)
Delete a whole bunch of concordance hits from a query-generated subcorpus.
cp | The CorpusList indicating the query to delete from. |
intervals | A Bitfield containing a bit for each query hit, which is true if the hit is "selected", false if not. |
mode | ALL_LINES, SELECTED_LINES or UNSELECTED_LINES (indicating which lines to delete). |
References ALL_LINES, auto_save, cl_free, BFBuf::elements, _Range::end, get_bit(), cl::keywords, cl::range, RangeSetop(), RReduce, save_subcorpus(), SELECTED_LINES, cl::size, cl::sortidx, _Range::start, SUB, cl::targets, TEMP, touch_corpus(), cl::type, and UNSELECTED_LINES.
Referenced by copy_intervals(), do_delete_lines(), do_delete_lines_num(), do_reduce(), and do_StandardQuery().
void FreeSortClause(SortClause sc)
Frees a SortClause object.
References _sort_clause::attribute_name, and cl_free.
int RangeSetop(CorpusList *corpus1, RangeSetOp operation, CorpusList *corpus2, Bitfield restrictor)
Carries out one of a set of operations on corpus1.
The operations that can be carried out are as follows:
RUnion - copy intervals from corpus2 to corpus1 (no duplicates); RIntersection - remove from corpus1 any intervals that are not also in corpus2; RDiff (no description given); RMaximalMatches - remove spurious matches according to "longest" strategy; RMinimalMatches - remove spurious matches according to "shortest" strategy; RLeftMaximalMatches - remove spurious matches according to "standard" strategy; RNonOverlapping (no description given); RUniq - remove duplicate intervals from corpus1; RReduce - remove intervals marked for deletion (by having the start member set to -1).
TODO: to avoid confusion with the object, a better name for this function would be do_RangeSetOp.
corpus1 | The corpus to be changed. |
operation | Specifies which operation is to be carried out. |
corpus2 | The corpus that is the second argument for this operation. Can be NULL if no corpus2 is required for operation. |
restrictor | Specifies which intervals in corpus2 are to be taken notice of versus ignored. Can be NULL. |
References cl_free, cl_malloc(), cl_realloc(), _Range::end, get_bit(), cl::keywords, cl::range, RangeSetop(), RDiff, RIntersection, RLeftMaximalMatches, RMaximalMatches, RMinimalMatches, RNonOverlapping, RReduce, rs_cp_range(), RUnion, RUniq, cl::size, cl::sortidx, _Range::start, cl::targets, and touch_corpus().
Referenced by copy_intervals(), delete_interval(), delete_intervals(), do_cut(), do_setop(), do_StandardQuery(), evaluate_subset(), expand_dataspace(), findcorpus(), prepare_Query(), RangeSetop(), and set_corpus_matchlists().
void RangeSort(CorpusList *c, int mk_sortidx)
Make sure that ranges are sorted in 'natural' order (i.e. by start and end cpos).
This function has to be called when matching ranges are modified, and may be needed when loading a query result (with "undump") that is not sorted in ascending order. With the optional "mk_sortidx" flag, a sortidx corresponding to the original ordering is created.
c | The corpus (i.e. subcorpus/query) whose intervals ('ranges') are to be sorted. |
mk_sortidx | Boolean flag: if true a sortidx is created. |
References _RS_compare_ranges(), cl_free, cl_malloc(), cqpmessage(), Error, cl::keywords, cl::name, cl::range, cl::size, cl::sortidx, SUB, cl::targets, TEMP, cl::type, and Warning.
Referenced by do_undump(), evaluate_target(), and set_target().
int SortSubcorpus(CorpusList *cl, SortClause sc, int count_mode, struct Redir *redir)
Sort the (query) subcorpus specified by cl, or count frequencies of matching strings.
(Note that frequency counting and query result sorting are done via the same sorting algorithm.)
If the sort was not performed successfully, the sort index is reset to the default sort order, and the function returns false.
cl | Subcorpus designating the query to sort. |
sc | A sort clause. sc = NULL resets the sort index to the default sort order (i.e. sorted by corpus position). |
count_mode | Boolean: run the function in count frequency mode? |
redir | Redir object for where the output of string-counting is to be displayed. |
References access_corpus(), _sort_clause::anchor1, _sort_clause::anchor2, ATT_POS, _sort_clause::attribute_name, break_ties, TCorpus::charset, cl_cpos2id(), cl_cpos2str(), cl_free, cl_malloc(), cl_max_cpos(), cl_strdup(), cl_string_canonical(), cl_string_reverse(), close_stream(), cl::corpus, cqp, cqpmessage(), current_sortidx, DEFAULT_ATT_NAME, _Range::end, Error, EvaluationIsRunning, find_attribute, _sort_clause::flags, group2compare(), group_first, group_size, i2compare(), Info, insecure, install_signal_handler(), KeywordField, cl::keywords, MatchEndField, MatchField, cl::name, NoField, _sort_clause::offset1, _sort_clause::offset2, open_stream(), pretty_print, cl::range, cl::size, _sort_clause::sort_ascending, sort_id_cache, _sort_clause::sort_reverse, SortExternally(), cl::sortidx, srt_anchor1, srt_anchor2, srt_ascending, srt_end, srt_flags, srt_offset1, srt_offset2, srt_reverse, srt_start, _Range::start, Redir::stream, TargetField, cl::targets, text_size, touch_corpus(), USE_SORT_CACHE, UseExternalSorting, Warning, and which_app.
int SortSubcorpusRandomize(CorpusList *cl, int seed)
Sorts a query result in random order.
If seed > 0, a reproducible and stable ordering is generated based on the start and end corpus positions of matches (i.e. two given matches will always be sorted in the same order).
cl | Corpus-list object representing the query to sort. |
seed | Seed for the randomiser; should ideally be a prime number (2^31 is a particularly bad choice); if it is 0, then the internal RNG's standard random order is used. |
References access_corpus(), cl_free, cl_malloc(), cl_random(), cl_set_rng_state(), cqp, cqpmessage(), _Range::end, Error, EvaluationIsRunning, Info, install_signal_handler(), cl::name, random_compare(), random_sort_keys, cl::range, cl::size, cl::sortidx, _Range::start, touch_corpus(), Warning, and which_app.