CWB
|
This was a temporary, experimental "fiddle" with Unicode programming. More...
This was a temporary, experimental "fiddle" with Unicode programming.
#define MAX_INPUT_LINE_LENGTH 65536 |
Input buffer size: copied from cwb-encode.
Referenced by cwbci_check_line(), and main().
int cwbci_begins_with_blank | ( | char * | str | ) |
Function for the inner loop in cwbci_check_line().
IMPORTANT NOTE: if this function is to be used elsewhere, it will need adapting, because it assumes that all UTF-8 is well-validated and that blanks will be deleted from the line, starting with the first character.
Referenced by cwbci_check_line().
void cwbci_check_line | ( | char * | line | ) |
References check_nesting, cl_free, cl_strdup(), cl_string_list_append(), cl_string_list_get(), cl_string_list_size(), cwbci_begins_with_blank(), cwbci_encoding_ok(), cwbci_file_write_abort(), cwbci_is_wordchar(), cwbci_report_error_fixable(), cwbci_report_error_unfixable(), established_number_of_p_atts, MAX_INPUT_LINE_LENGTH, output_fd, _cl_string_list::size, skip_empty_lines, strip_blanks, and xml_aware.
Referenced by main().
int cwbci_encoding_ok | ( | char * | str | ) |
checks whether the encoding of a given string is OK.
(Maybe move to the CL later? In that case the charset should be a parameter, as a global variable cannot be assumed in all programs.) Returns boolean.
References ascii, charset, latin1, and utf8.
Referenced by cwbci_check_line().
void cwbci_file_write_abort | ( | void | ) |
convenience function with which to abort the program if file-write fails.
References input_fd, output_fd, and output_file.
Referenced by cwbci_check_line().
int cwbci_is_wordchar | ( | char | c | ) |
Referenced by cwbci_check_line().
void cwbci_parse_options | ( | int | argc, |
char ** | argv | ||
) |
Parses commandline options for cwb-check-input and sets global variables accordingly.
References charset, charset_label, check_nesting, cl_charset_from_name(), cl_charset_name_canonical(), cl_new_string_list(), cl_strdup(), cwbci_usage(), input_file, output_file, print_fixable_errors, print_unfixable_errors, silent, skip_empty_lines, strip_blanks, verbose, and xml_aware.
Referenced by main().
void cwbci_report_error_fixable | ( | char * | msg | ) |
References errors_detected, line_no, and print_fixable_errors.
Referenced by cwbci_check_line().
void cwbci_report_error_unfixable | ( | char * | msg | ) |
References errors_detected, line_no, and print_unfixable_errors.
Referenced by cwbci_check_line().
void cwbci_usage | ( | void | ) |
References progname.
Referenced by cwbci_parse_options().
int main | ( | int | argc, |
char ** | argv | ||
) |
Main function for cwb-check-input.
References cl_delete_string_list(), cl_free, cl_free_string_list(), cwbci_check_line(), cwbci_parse_options(), errors_detected, input_fd, input_file, line, line_no, MAX_INPUT_LINE_LENGTH, output_fd, output_file, and progname.
character set used for checking encoding
Referenced by add_corpus_property(), cl_new_regex(), cl_string_maptable(), cwbci_begins_with_blank(), cwbci_encoding_ok(), cwbci_parse_options(), and decode_print_xml_declaration().
char* charset_label = "ascii" |
label of character set used for checking encoding
Referenced by cwbci_parse_options().
int check_nesting = 0 |
check perfect nesting of XML?
Referenced by cwbci_check_line(), and cwbci_parse_options().
int errors_detected = 0 |
number of errors found so far
Referenced by cwbci_report_error_fixable(), cwbci_report_error_unfixable(), and main().
int established_number_of_p_atts = 0 |
first p-att line established number of tags; anything that deviates then counts as an error
Referenced by cwbci_check_line().
cl_string_list hierarchy = NULL |
string list for keeping track of the XML hierarchy
FILE* input_fd = NULL |
file handle for the input file
Referenced by cwbci_file_write_abort(), encode_get_input_line(), lexdecode_show(), and main().
char* input_file = NULL |
filename of the input file
Referenced by cwbci_parse_options(), and main().
int line_no = 0 |
line number of the line in the input file currently being checked; first == 1
Referenced by cwbci_report_error_fixable(), cwbci_report_error_unfixable(), and main().
FILE* output_fd = NULL |
file handle for the output file; also used for boolean tests on whether we are repairing or not
Referenced by cwbci_check_line(), cwbci_file_write_abort(), and main().
char* output_file = NULL |
filename of the output file
Referenced by cwbci_file_write_abort(), cwbci_parse_options(), main(), and scancorpus_parse_options().
int print_fixable_errors = 0 |
deduced from mode, silent & verbose
Referenced by cwbci_parse_options(), and cwbci_report_error_fixable().
int print_unfixable_errors = 0 |
deduced from mode, silent & verbose
Referenced by cwbci_parse_options(), and cwbci_report_error_unfixable().
char* progname = NULL |
name of the currently running program
int silent = 0 |
hide messages
int skip_empty_lines = 0 |
check for empty lines
Referenced by cwbci_check_line(), cwbci_parse_options(), encode_parse_options(), and main().
int strip_blanks = 0 |
check for leading and trailing blanks in input and token annotations?
Referenced by cwbci_check_line(), cwbci_parse_options(), encode_add_wattr_line(), encode_parse_options(), main(), and range_open().
int verbose = 0 |
show messages about fixable errors in repair mode
int xml_aware = 0 |
ignore <? and <! lines
Referenced by cwbci_check_line(), cwbci_parse_options(), encode_add_wattr_line(), encode_parse_options(), and main().