Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Path: blob/master/external/source/byakugan/csv_parser.cpp
Views: 11766
1/* INCLUDING HEADER FILES */23#include "csv_parser.hpp"456/* BEGIN DEFINITION FOR PUBLIC METHODS */7bool csv_parser::init(FILE * input_file_pointer)8{9input_fp = input_file_pointer;1011if (input_fp == NULL)12{13//dprintf("Fatal error : unable to open input file from file pointer\n");1415return false;16}1718/* Resetting the internal pointer to the beginning of the stream */19rewind(input_fp);2021more_rows = true;2223_skip_lines();2425return true;26}2728bool csv_parser::init(const char * input_file)29{30const size_t filename_length = strlen(input_file);3132if (!filename_length)33{34fprintf(stderr, "Fatal error : invalid input file %s\n", input_file);3536return false;37}3839input_filename = (char *) malloc(filename_length + 1);4041if (input_filename == NULL)42{43//dprintf("Fatal error : unable to allocate memory for file name buffer %s\n", input_file);4445return false;46}4748memset(input_filename, 0, filename_length + 1);4950strcpy(input_filename, input_file);5152input_fp = fopen(input_file, "r");5354if (input_fp == NULL)55{56//dprintf("Fatal error : unable to open input file %s\n", input_file);5758CSV_PARSER_FREE_BUFFER_PTR(input_filename);5960return false;61}6263more_rows = true;6465_skip_lines();6667return true;68}6970void csv_parser::set_enclosed_char(char fields_enclosed_by, enclosure_type_t enclosure_mode)71{72if (fields_enclosed_by != 0)73{74enclosed_char = fields_enclosed_by;75enclosed_length = 1U;76enclosure_type = enclosure_mode;77}78}7980void csv_parser::set_field_term_char(char fields_terminated_by)81{82if (fields_terminated_by != 0)83{84field_term_char = fields_terminated_by;85field_term_length = 1U;86}87}8889void csv_parser::set_line_term_char(char lines_terminated_by)90{91if (lines_terminated_by != 0)92{93line_term_char = lines_terminated_by;94line_term_length = 1U;95}96}9798csv_row csv_parser::get_row(void)99{100csv_row current_row;101102/* This will store the length of the buffer */103unsigned int line_length = 0U;104105/* Character array buffer for the current record */106char * line = NULL;107108/* Grab one record */109_read_single_line(&line, &line_length);110111/* Select the most suitable field extractor based on the enclosure length */112switch(enclosure_type)113{114case ENCLOSURE_NONE : /* The fields are not enclosed by any character */115_get_fields_without_enclosure(¤t_row, line, &line_length);116break;117118case ENCLOSURE_REQUIRED : /* The fields are enclosed by a character */119_get_fields_with_enclosure(¤t_row, line, &line_length);120break;121122case ENCLOSURE_OPTIONAL : /* The fields may or may not be enclosed */123_get_fields_with_optional_enclosure(¤t_row, line, &line_length);124break;125126default :127_get_fields_with_optional_enclosure(¤t_row, line, &line_length);128break;129}130131/* Deallocate the current buffer */132CSV_PARSER_FREE_BUFFER_PTR(line);133134/* Keeps track of how many times this has method has been called */135record_count++;136137return current_row;138}139140/* BEGIN DEFINITION FOR PROTECTED METHODS */141142143/* BEGIN DEFINITION FOR PRIVATE METHODS */144145void csv_parser::_skip_lines(void)146{147/* Just in case the user accidentally sets ignore_num_lines to a negative number */148unsigned int number_of_lines_to_ignore = abs((int) ignore_num_lines);149150while(has_more_rows() && number_of_lines_to_ignore)151{152const csv_row row = get_row();153154number_of_lines_to_ignore--;155}156157record_count = 0U;158}159160void csv_parser::_get_fields_without_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)161{162char * field = NULL;163164if (*line_length > 0)165{166field = (char *) malloc(*line_length);167168memset(field, 0, *line_length);169170register unsigned int field_start = 0U;171register unsigned int field_end = 0U;172register unsigned int char_pos = 0U;173174while(char_pos < *line_length)175{176char curr_char = line[char_pos];177178if (curr_char == field_term_char)179{180field_end = char_pos;181182const char * field_starts_at = line + field_start;183184/* Field width must exclude field delimiter characters */185const unsigned int field_width = field_end - field_start;186187/* Copy exactly field_width bytes from field_starts_at to field */188memcpy(field, field_starts_at, field_width);189190/* This must be a null-terminated character array */191field[field_width] = 0x00;192193string field_string_obj = field;194195row->push_back(field_string_obj);196197/* This is the starting point of the next field */198field_start = char_pos + 1;199200} else if (curr_char == line_term_char)201{202field_end = char_pos;203204const char * field_starts_at = line + field_start;205206/* Field width must exclude line terminating characters */207const unsigned int field_width = field_end - field_start;208209/* Copy exactly field_width bytes from field_starts_at to field */210memcpy(field, field_starts_at, field_width);211212/* This must be a null-terminated character array */213field[field_width] = 0x00;214215string field_string_obj = field;216217row->push_back(field_string_obj);218}219220/* Move to the next character in the current line */221char_pos++;222}223224/* Deallocate memory for field buffer */225CSV_PARSER_FREE_BUFFER_PTR(field);226}227}228229void csv_parser::_get_fields_with_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)230{231char * field = NULL;232233if (*line_length > 0)234{235field = (char *) malloc(*line_length);236237memset(field, 0, *line_length);238239register unsigned int current_state = 0U;240register unsigned int field_start = 0U;241register unsigned int field_end = 0U;242register unsigned int char_pos = 0U;243244while(char_pos < *line_length)245{246char curr_char = line[char_pos];247248if (curr_char == enclosed_char)249{250current_state++;251252/* Lets find out if the enclosure character encountered is253* a 'real' enclosure character or if it is an embedded character that254* has been escaped within the field.255*/256register char previous_char = 0x00;257258if (char_pos > 0U)259{260/* The escaped char will have to be the 2rd or later character. */261previous_char = line[char_pos - 1];262263if (previous_char == escaped_char)264{265--current_state;266}267}268269if (current_state == 1U && previous_char != escaped_char)270{271/* This marks the beginning of the column */272field_start = char_pos;273274} else if (current_state == 2U)275{276/* We have found the end of the current field */277field_end = char_pos;278279/* We do not need the enclosure characters */280const char * field_starts_at = line + field_start + 1U;281282/* Field width must exclude beginning and ending enclosure characters */283const unsigned int field_width = field_end - field_start - 1U;284285/* Copy exactly field_width bytes from field_starts_at to field */286memcpy(field, field_starts_at, field_width);287288/* This must be a null-terminated character array */289field[field_width] = 0x00;290291string field_string_obj = field;292293row->push_back(field_string_obj);294295/* Reset the state to zero value for the next field */296current_state = 0U;297}298}299300/* Move to the next character in the current line */301char_pos++;302}303304/* If no enclosures were found in this line, the entire line becomes the only field. */305if (0 == row->size())306{307string entire_line = line;308309row->push_back(entire_line);310311} else if (current_state == 1U)312{313/* The beginning enclosure character was found but314* we could not locate the closing enclosure in the current line315* So we need to copy the remainder of the line into the last field.316*/317318/* We do not need the starting enclosure character */319const char * field_starts_at = line + field_start + 1U;320321/* Field width must exclude beginning characters */322const unsigned int field_width = *line_length - field_start - 1U;323324/* Copy exactly field_width bytes from field_starts_at to field */325memcpy(field, field_starts_at, field_width);326327/* This must be a null-terminated character array */328field[field_width] = 0x00;329330string field_string_obj = field;331332row->push_back(field_string_obj);333}334335/* Release the buffer for the field */336CSV_PARSER_FREE_BUFFER_PTR(field);337}338}339340void csv_parser::_get_fields_with_optional_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)341{342char * field = NULL;343344/*345* How to extract the fields, when the enclosure char is optional.346*347* This is very similar to parsing the document without enclosure but with the following conditions.348*349* If the beginning char is an enclosure character, adjust the starting position of the string by + 1.350* If the ending char is an enclosure character, adjust the ending position by -1351*/352if (*line_length > 0)353{354field = (char *) malloc(*line_length);355356memset(field, 0, *line_length);357358register unsigned int field_start = 0U;359register unsigned int field_end = 0U;360register unsigned int char_pos = 0U;361362while(char_pos < *line_length)363{364char curr_char = line[char_pos];365366if (curr_char == field_term_char)367{368field_end = char_pos;369370const char * field_starts_at = line + field_start;371372/* Field width must exclude field delimiter characters */373unsigned int field_width = field_end - field_start;374375const char line_first_char = field_starts_at[0];376const char line_final_char = field_starts_at[field_width - 1];377378/* If the enclosure char is found at either ends of the string */379unsigned int first_adjustment = (line_first_char == enclosed_char) ? 1U : 0U;380unsigned int final_adjustment = (line_final_char == enclosed_char) ? 2U : 0U;381382/* We do not want to have any negative or zero field widths */383field_width = (field_width > 2U) ? (field_width - final_adjustment) : field_width;384385/* Copy exactly field_width bytes from field_starts_at to field */386memcpy(field, field_starts_at + first_adjustment, field_width);387388/* This must be a null-terminated character array */389field[field_width] = 0x00;390391string field_string_obj = field;392393row->push_back(field_string_obj);394395/* This is the starting point of the next field */396field_start = char_pos + 1;397398} else if (curr_char == line_term_char)399{400field_end = char_pos;401402const char * field_starts_at = line + field_start;403404/* Field width must exclude line terminating characters */405unsigned int field_width = field_end - field_start;406407const char line_first_char = field_starts_at[0];408const char line_final_char = field_starts_at[field_width - 1];409410/* If the enclosure char is found at either ends of the string */411unsigned int first_adjustment = (line_first_char == enclosed_char) ? 1U : 0U;412unsigned int final_adjustment = (line_final_char == enclosed_char) ? 2U : 0U;413414/* We do not want to have any negative or zero field widths */415field_width = (field_width > 2U) ? (field_width - final_adjustment) : field_width;416417/* Copy exactly field_width bytes from field_starts_at to field */418memcpy(field, field_starts_at + first_adjustment, field_width);419420/* This must be a null-terminated character array */421field[field_width] = 0x00;422423string field_string_obj = field;424425row->push_back(field_string_obj);426}427428/* Move to the next character in the current line */429char_pos++;430}431432/* Deallocate memory for field buffer */433CSV_PARSER_FREE_BUFFER_PTR(field);434}435}436437void csv_parser::_read_single_line(char ** buffer, unsigned int * buffer_len)438{439long int original_pos = ftell(input_fp);440long int current_pos = original_pos;441442register int current_char = 0;443444/* Checking one character at a time until the end of a line is found */445while(true)446{447current_char = fgetc(input_fp);448449if (current_char == EOF)450{451/* We have reached the end of the file */452more_rows = false;453454break;455456} else if (current_char == line_term_char)457{458/* We have reached the end of the row */459current_pos++;460461break;462463} else {464465current_pos++;466}467}468469/* Let's try to peek one character ahead to see if we are at the end of the file */470if (more_rows)471{472current_char = fgetc(input_fp);473474more_rows = (current_char == EOF) ? false : true;475}476477/* Find out how long this row is */478const size_t length_of_row = current_pos - original_pos;479480if (length_of_row > 0)481{482*buffer_len = length_of_row * sizeof(char) + 1;483484*buffer = (char *) realloc(*buffer, *buffer_len);485486memset(*buffer, 0, *buffer_len);487488/* Reset the internal pointer to the original position */489fseek(input_fp, original_pos, SEEK_SET);490491/* Copy the contents of the line into the buffer */492fread(*buffer, 1, length_of_row, input_fp);493}494}495496497