CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
rapid7

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.

GitHub Repository: rapid7/metasploit-framework
Path: blob/master/external/source/byakugan/csv_parser.cpp
Views: 11766
1
2
/* INCLUDING HEADER FILES */
3
4
#include "csv_parser.hpp"
5
6
7
/* BEGIN DEFINITION FOR PUBLIC METHODS */
8
bool csv_parser::init(FILE * input_file_pointer)
9
{
10
input_fp = input_file_pointer;
11
12
if (input_fp == NULL)
13
{
14
//dprintf("Fatal error : unable to open input file from file pointer\n");
15
16
return false;
17
}
18
19
/* Resetting the internal pointer to the beginning of the stream */
20
rewind(input_fp);
21
22
more_rows = true;
23
24
_skip_lines();
25
26
return true;
27
}
28
29
bool csv_parser::init(const char * input_file)
30
{
31
const size_t filename_length = strlen(input_file);
32
33
if (!filename_length)
34
{
35
fprintf(stderr, "Fatal error : invalid input file %s\n", input_file);
36
37
return false;
38
}
39
40
input_filename = (char *) malloc(filename_length + 1);
41
42
if (input_filename == NULL)
43
{
44
//dprintf("Fatal error : unable to allocate memory for file name buffer %s\n", input_file);
45
46
return false;
47
}
48
49
memset(input_filename, 0, filename_length + 1);
50
51
strcpy(input_filename, input_file);
52
53
input_fp = fopen(input_file, "r");
54
55
if (input_fp == NULL)
56
{
57
//dprintf("Fatal error : unable to open input file %s\n", input_file);
58
59
CSV_PARSER_FREE_BUFFER_PTR(input_filename);
60
61
return false;
62
}
63
64
more_rows = true;
65
66
_skip_lines();
67
68
return true;
69
}
70
71
void csv_parser::set_enclosed_char(char fields_enclosed_by, enclosure_type_t enclosure_mode)
72
{
73
if (fields_enclosed_by != 0)
74
{
75
enclosed_char = fields_enclosed_by;
76
enclosed_length = 1U;
77
enclosure_type = enclosure_mode;
78
}
79
}
80
81
void csv_parser::set_field_term_char(char fields_terminated_by)
82
{
83
if (fields_terminated_by != 0)
84
{
85
field_term_char = fields_terminated_by;
86
field_term_length = 1U;
87
}
88
}
89
90
void csv_parser::set_line_term_char(char lines_terminated_by)
91
{
92
if (lines_terminated_by != 0)
93
{
94
line_term_char = lines_terminated_by;
95
line_term_length = 1U;
96
}
97
}
98
99
/*
 * Reads the next record from the input and splits it into fields.
 *
 * The raw record is fetched with _read_single_line() and handed to the field
 * extractor that matches enclosure_type; any value other than ENCLOSURE_NONE
 * or ENCLOSURE_REQUIRED is handled as ENCLOSURE_OPTIONAL. record_count is
 * incremented on every call.
 *
 * @return the fields of the record that was read
 */
csv_row csv_parser::get_row(void)
{
    csv_row parsed_fields;

    /* Raw text of the record and the size of the buffer that holds it */
    char * record_text = NULL;
    unsigned int record_size = 0U;

    _read_single_line(&record_text, &record_size);

    switch (enclosure_type)
    {
        case ENCLOSURE_NONE :
            /* Fields are never wrapped in an enclosure character */
            _get_fields_without_enclosure(&parsed_fields, record_text, &record_size);
            break;

        case ENCLOSURE_REQUIRED :
            /* Every field is wrapped in the enclosure character */
            _get_fields_with_enclosure(&parsed_fields, record_text, &record_size);
            break;

        case ENCLOSURE_OPTIONAL :
        default :
            /* Fields may or may not be wrapped; also the fallback for unknown modes */
            _get_fields_with_optional_enclosure(&parsed_fields, record_text, &record_size);
            break;
    }

    /* The raw record is no longer needed once the fields have been copied out */
    CSV_PARSER_FREE_BUFFER_PTR(record_text);

    /* Counts how many times this method has been called */
    ++record_count;

    return parsed_fields;
}
140
141
/* BEGIN DEFINITION FOR PROTECTED METHODS */
142
143
144
/* BEGIN DEFINITION FOR PRIVATE METHODS */
145
146
void csv_parser::_skip_lines(void)
147
{
148
/* Just in case the user accidentally sets ignore_num_lines to a negative number */
149
unsigned int number_of_lines_to_ignore = abs((int) ignore_num_lines);
150
151
while(has_more_rows() && number_of_lines_to_ignore)
152
{
153
const csv_row row = get_row();
154
155
number_of_lines_to_ignore--;
156
}
157
158
record_count = 0U;
159
}
160
161
/*
 * Splits a record into fields when no enclosure character is in use.
 *
 * Every occurrence of field_term_char ends a field and starts the next one;
 * line_term_char ends the last field. If the record carries no line
 * terminator (for example the final line of a file without a trailing
 * newline) the text after the last delimiter is still appended as a field.
 *
 * @param row          receives the extracted fields via push_back()
 * @param line         record text; nothing is done when it is NULL
 * @param line_length  number of bytes of line to scan; nothing is done when
 *                     it is 0
 */
void csv_parser::_get_fields_without_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)
{
    if (line == NULL || *line_length == 0U)
    {
        return;
    }

    const unsigned int total_length = *line_length;

    /* Holds the bytes of one field; replaces a hand-managed malloc() buffer */
    std::string scratch;

    unsigned int field_start = 0U;
    bool line_terminated = false;

    for (unsigned int char_pos = 0U; char_pos < total_length; ++char_pos)
    {
        const char curr_char = line[char_pos];

        const bool ends_field = (curr_char == field_term_char);
        const bool ends_line = !ends_field && (curr_char == line_term_char);

        if (!ends_field && !ends_line)
        {
            continue;
        }

        /* The field excludes the delimiter or terminator that closed it */
        scratch.assign(line + field_start, char_pos - field_start);

        /* Going through c_str() keeps the field cut at an embedded NUL byte */
        row->push_back(std::string(scratch.c_str()));

        if (ends_field)
        {
            /* This is the starting point of the next field */
            field_start = char_pos + 1U;
        }
        else
        {
            line_terminated = true;
        }
    }

    if (!line_terminated)
    {
        /* No terminator was seen: keep the text after the last delimiter.
         * The c_str() conversion stops at the first NUL byte, so a terminating
         * NUL included in line_length does not end up in the field.
         */
        scratch.assign(line + field_start, total_length - field_start);

        row->push_back(std::string(scratch.c_str()));
    }
}
229
230
/*
 * Splits a record into fields when every field is wrapped in enclosed_char.
 *
 * The text between an opening and a closing enclosure character becomes one
 * field; everything outside the enclosures is skipped. An enclosure character
 * that directly follows escaped_char is treated as part of the field and does
 * not open or close it (the escape character itself stays in the field text).
 *
 * Special cases:
 *  - when no complete field was found, the entire line becomes the only field;
 *  - when at least one field was found and a further enclosure was opened but
 *    never closed, the rest of the line becomes the last field.
 *
 * @param row          receives the extracted fields via push_back()
 * @param line         NUL-terminated record text; nothing is done when NULL
 * @param line_length  number of bytes of line to scan; nothing is done when 0
 */
void csv_parser::_get_fields_with_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)
{
    if (line == NULL || *line_length == 0U)
    {
        return;
    }

    const unsigned int total_length = *line_length;

    /* Holds the bytes of one field; replaces a hand-managed malloc() buffer */
    std::string scratch;

    /* 0 = outside a field, 1 = opening enclosure seen, waiting for the closing one */
    unsigned int current_state = 0U;
    unsigned int field_start = 0U;

    for (unsigned int char_pos = 0U; char_pos < total_length; ++char_pos)
    {
        if (line[char_pos] != enclosed_char)
        {
            continue;
        }

        /* An escaped enclosure can only be the 2nd or a later character */
        const bool is_escaped = (char_pos > 0U) && (line[char_pos - 1U] == escaped_char);

        if (is_escaped)
        {
            /* Embedded enclosure character: it neither opens nor closes a field */
            continue;
        }

        ++current_state;

        if (current_state == 1U)
        {
            /* This marks the beginning of the field */
            field_start = char_pos;

            continue;
        }

        /* Closing enclosure: copy what lies between the two enclosure characters */
        scratch.assign(line + field_start + 1U, char_pos - field_start - 1U);

        /* Going through c_str() keeps the field cut at an embedded NUL byte */
        row->push_back(std::string(scratch.c_str()));

        /* Reset the state for the next field */
        current_state = 0U;
    }

    if (0 == row->size())
    {
        /* No enclosed field was found in this line: the entire line becomes the only field */
        row->push_back(std::string(line));
    }
    else if (current_state == 1U)
    {
        /* An opening enclosure was found without a closing one, so the
         * remainder of the line (minus the opening character) is the last field.
         */
        scratch.assign(line + field_start + 1U, total_length - field_start - 1U);

        row->push_back(std::string(scratch.c_str()));
    }
}
340
341
void csv_parser::_get_fields_with_optional_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)
342
{
343
char * field = NULL;
344
345
/*
346
* How to extract the fields, when the enclosure char is optional.
347
*
348
* This is very similar to parsing the document without enclosure but with the following conditions.
349
*
350
* If the beginning char is an enclosure character, adjust the starting position of the string by + 1.
351
* If the ending char is an enclosure character, adjust the ending position by -1
352
*/
353
if (*line_length > 0)
354
{
355
field = (char *) malloc(*line_length);
356
357
memset(field, 0, *line_length);
358
359
register unsigned int field_start = 0U;
360
register unsigned int field_end = 0U;
361
register unsigned int char_pos = 0U;
362
363
while(char_pos < *line_length)
364
{
365
char curr_char = line[char_pos];
366
367
if (curr_char == field_term_char)
368
{
369
field_end = char_pos;
370
371
const char * field_starts_at = line + field_start;
372
373
/* Field width must exclude field delimiter characters */
374
unsigned int field_width = field_end - field_start;
375
376
const char line_first_char = field_starts_at[0];
377
const char line_final_char = field_starts_at[field_width - 1];
378
379
/* If the enclosure char is found at either ends of the string */
380
unsigned int first_adjustment = (line_first_char == enclosed_char) ? 1U : 0U;
381
unsigned int final_adjustment = (line_final_char == enclosed_char) ? 2U : 0U;
382
383
/* We do not want to have any negative or zero field widths */
384
field_width = (field_width > 2U) ? (field_width - final_adjustment) : field_width;
385
386
/* Copy exactly field_width bytes from field_starts_at to field */
387
memcpy(field, field_starts_at + first_adjustment, field_width);
388
389
/* This must be a null-terminated character array */
390
field[field_width] = 0x00;
391
392
string field_string_obj = field;
393
394
row->push_back(field_string_obj);
395
396
/* This is the starting point of the next field */
397
field_start = char_pos + 1;
398
399
} else if (curr_char == line_term_char)
400
{
401
field_end = char_pos;
402
403
const char * field_starts_at = line + field_start;
404
405
/* Field width must exclude line terminating characters */
406
unsigned int field_width = field_end - field_start;
407
408
const char line_first_char = field_starts_at[0];
409
const char line_final_char = field_starts_at[field_width - 1];
410
411
/* If the enclosure char is found at either ends of the string */
412
unsigned int first_adjustment = (line_first_char == enclosed_char) ? 1U : 0U;
413
unsigned int final_adjustment = (line_final_char == enclosed_char) ? 2U : 0U;
414
415
/* We do not want to have any negative or zero field widths */
416
field_width = (field_width > 2U) ? (field_width - final_adjustment) : field_width;
417
418
/* Copy exactly field_width bytes from field_starts_at to field */
419
memcpy(field, field_starts_at + first_adjustment, field_width);
420
421
/* This must be a null-terminated character array */
422
field[field_width] = 0x00;
423
424
string field_string_obj = field;
425
426
row->push_back(field_string_obj);
427
}
428
429
/* Move to the next character in the current line */
430
char_pos++;
431
}
432
433
/* Deallocate memory for field buffer */
434
CSV_PARSER_FREE_BUFFER_PTR(field);
435
}
436
}
437
438
void csv_parser::_read_single_line(char ** buffer, unsigned int * buffer_len)
439
{
440
long int original_pos = ftell(input_fp);
441
long int current_pos = original_pos;
442
443
register int current_char = 0;
444
445
/* Checking one character at a time until the end of a line is found */
446
while(true)
447
{
448
current_char = fgetc(input_fp);
449
450
if (current_char == EOF)
451
{
452
/* We have reached the end of the file */
453
more_rows = false;
454
455
break;
456
457
} else if (current_char == line_term_char)
458
{
459
/* We have reached the end of the row */
460
current_pos++;
461
462
break;
463
464
} else {
465
466
current_pos++;
467
}
468
}
469
470
/* Let's try to peek one character ahead to see if we are at the end of the file */
471
if (more_rows)
472
{
473
current_char = fgetc(input_fp);
474
475
more_rows = (current_char == EOF) ? false : true;
476
}
477
478
/* Find out how long this row is */
479
const size_t length_of_row = current_pos - original_pos;
480
481
if (length_of_row > 0)
482
{
483
*buffer_len = length_of_row * sizeof(char) + 1;
484
485
*buffer = (char *) realloc(*buffer, *buffer_len);
486
487
memset(*buffer, 0, *buffer_len);
488
489
/* Reset the internal pointer to the original position */
490
fseek(input_fp, original_pos, SEEK_SET);
491
492
/* Copy the contents of the line into the buffer */
493
fread(*buffer, 1, length_of_row, input_fp);
494
}
495
}
496
497