CoCalc -- csv_parser.cpp

GitHub Repository: rapid7/metasploit-framework
Path: blob/master/external/source/byakugan/csv_parser.cpp
¹⁹⁵⁹² views
1

2
/* INCLUDING HEADER FILES */
3

4
#include "csv_parser.hpp"
5

6

7
/* BEGIN DEFINITION FOR PUBLIC METHODS */
8
bool csv_parser::init(FILE * input_file_pointer)
9
{
10
	input_fp = input_file_pointer;
11

12
	if (input_fp == NULL)
13
	{
14
		//dprintf("Fatal error : unable to open input file from file pointer\n");
15

16
		return false;
17
	}
18

19
	/* Resetting the internal pointer to the beginning of the stream */
20
	rewind(input_fp);
21

22
	more_rows = true;
23

24
	_skip_lines();
25

26
	return true;
27
}
28

29
bool csv_parser::init(const char * input_file)
30
{
31
	const size_t filename_length = strlen(input_file);
32

33
	if (!filename_length)
34
	{
35
		fprintf(stderr, "Fatal error : invalid input file %s\n", input_file);
36

37
		return false;
38
	}
39

40
	input_filename = (char *) malloc(filename_length + 1);
41

42
	if (input_filename == NULL)
43
	{
44
		//dprintf("Fatal error : unable to allocate memory for file name buffer %s\n", input_file);
45

46
		return false;
47
	}
48

49
	memset(input_filename, 0, filename_length + 1);
50

51
	strcpy(input_filename, input_file);
52

53
	input_fp = fopen(input_file, "r");
54

55
	if (input_fp == NULL)
56
	{
57
		//dprintf("Fatal error : unable to open input file %s\n", input_file);
58

59
		CSV_PARSER_FREE_BUFFER_PTR(input_filename);
60

61
		return false;
62
	}
63

64
	more_rows = true;
65

66
	_skip_lines();
67

68
	return true;
69
}
70

71
void csv_parser::set_enclosed_char(char fields_enclosed_by, enclosure_type_t enclosure_mode)
72
{
73
	if (fields_enclosed_by != 0)
74
	{
75
		enclosed_char   = fields_enclosed_by;
76
		enclosed_length = 1U;
77
		enclosure_type  = enclosure_mode;
78
	}
79
}
80

81
void csv_parser::set_field_term_char(char fields_terminated_by)
82
{
83
	if (fields_terminated_by != 0)
84
	{
85
		field_term_char   = fields_terminated_by;
86
		field_term_length = 1U;
87
	}
88
}
89

90
void csv_parser::set_line_term_char(char lines_terminated_by)
91
{
92
	if (lines_terminated_by != 0)
93
	{
94
		line_term_char   = lines_terminated_by;
95
		line_term_length = 1U;
96
	}
97
}
98

99
/*
 * Reads the next record from the input stream and splits it into fields.
 *
 * The field extractor is chosen from enclosure_type; any value other than
 * ENCLOSURE_NONE or ENCLOSURE_REQUIRED is treated as optional enclosure.
 * record_count is incremented on every call.
 */
csv_row csv_parser::get_row(void)
{
	/* Raw text of the record and the size of the buffer holding it */
	char * record_text = NULL;
	unsigned int record_length = 0U;

	_read_single_line(&record_text, &record_length);

	csv_row parsed_row;

	if (enclosure_type == ENCLOSURE_NONE)
	{
		/* The fields are not enclosed by any character */
		_get_fields_without_enclosure(&parsed_row, record_text, &record_length);
	}
	else if (enclosure_type == ENCLOSURE_REQUIRED)
	{
		/* The fields are enclosed by a character */
		_get_fields_with_enclosure(&parsed_row, record_text, &record_length);
	}
	else
	{
		/* ENCLOSURE_OPTIONAL and anything unrecognised : fields may or may not be enclosed */
		_get_fields_with_optional_enclosure(&parsed_row, record_text, &record_length);
	}

	/* The raw text is no longer needed once the fields have been copied out */
	CSV_PARSER_FREE_BUFFER_PTR(record_text);

	/* Keeps track of how many times this method has been called */
	record_count++;

	return parsed_row;
}
140

141
/* BEGIN DEFINITION FOR PROTECTED METHODS */
142

143

144
/* BEGIN DEFINITION FOR PRIVATE METHODS */
145

146
void csv_parser::_skip_lines(void)
147
{
148
	/* Just in case the user accidentally sets ignore_num_lines to a negative number */
149
	unsigned int number_of_lines_to_ignore = abs((int) ignore_num_lines);
150

151
	while(has_more_rows() && number_of_lines_to_ignore)
152
	{
153
		const csv_row row = get_row();
154

155
		number_of_lines_to_ignore--;
156
	}
157

158
	record_count = 0U;
159
}
160

161
/*
 * Splits a record whose fields are not enclosed by any character.
 *
 * row         : receives one string per field, appended in order
 * line        : raw text of the record
 * line_length : number of bytes of line that may be examined
 *
 * A field ends at field_term_char or at line_term_char; neither delimiter is
 * copied into the field. When the record carries no line terminator (the
 * last line of a file without a trailing newline) the text after the last
 * field delimiter is emitted as the final field instead of being dropped.
 *
 * Nothing is appended when line is NULL or *line_length is zero.
 */
void csv_parser::_get_fields_without_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)
{
	if (line == NULL || *line_length == 0U)
	{
		return;
	}

	unsigned int field_start = 0U;
	bool line_terminated = false;

	for (unsigned int char_pos = 0U; char_pos < *line_length; ++char_pos)
	{
		const char curr_char = line[char_pos];

		/* The field delimiter takes precedence when both delimiters are the same character */
		const bool at_field_end = (curr_char == field_term_char);
		const bool at_line_end  = !at_field_end && (curr_char == line_term_char);

		if (!at_field_end && !at_line_end)
		{
			continue;
		}

		/* The c_str() round trip ends the field at the first NUL byte, if any */
		row->push_back(string(string(line + field_start, char_pos - field_start).c_str()));

		if (at_field_end)
		{
			/* This is the starting point of the next field */
			field_start = char_pos + 1U;
		}
		else
		{
			line_terminated = true;
		}
	}

	if (!line_terminated)
	{
		/* Unterminated record : whatever follows the last delimiter is the final field */
		row->push_back(string(string(line + field_start, *line_length - field_start).c_str()));
	}
}
229

230
/*
 * Splits a record whose fields are each wrapped in enclosed_char.
 *
 * row         : receives one string per field, appended in order
 * line        : raw text of the record
 * line_length : number of bytes of line that may be examined
 *
 * An enclosure character directly preceded by escaped_char is treated as
 * field content and does not open or close a field. The escape character
 * itself is left in the field text. Text outside of enclosures is skipped.
 *
 * If the row is still empty after the scan, the whole line becomes its only
 * field. Otherwise, if a field was opened but never closed, the remainder of
 * the line after the opening enclosure becomes the last field.
 *
 * Nothing is appended when line is NULL or *line_length is zero.
 */
void csv_parser::_get_fields_with_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)
{
	if (line == NULL || *line_length == 0U)
	{
		return;
	}

	/* 0 : outside of a field, 1 : inside a field whose closing enclosure is still pending */
	unsigned int current_state = 0U;
	unsigned int field_start   = 0U;

	for (unsigned int char_pos = 0U; char_pos < *line_length; ++char_pos)
	{
		if (line[char_pos] != enclosed_char)
		{
			continue;
		}

		/* An escaped enclosure can only be the 2nd or later character of the line */
		if (char_pos > 0U && line[char_pos - 1U] == escaped_char)
		{
			continue;
		}

		if (current_state == 0U)
		{
			/* This marks the beginning of the field */
			field_start   = char_pos;
			current_state = 1U;
		}
		else
		{
			/* Field width must exclude beginning and ending enclosure characters */
			const unsigned int field_width = char_pos - field_start - 1U;

			/* The c_str() round trip ends the field at the first NUL byte, if any */
			row->push_back(string(string(line + field_start + 1U, field_width).c_str()));

			/* Reset the state for the next field */
			current_state = 0U;
		}
	}

	if (0 == row->size())
	{
		/* No complete enclosed field in this line : the entire line becomes the only field */
		row->push_back(string(string(line, *line_length).c_str()));
	}
	else if (current_state == 1U)
	{
		/* The opening enclosure was found but not the closing one, so the
		 * remainder of the line (without the opening enclosure) is the last field.
		 */
		const unsigned int field_width = *line_length - field_start - 1U;

		row->push_back(string(string(line + field_start + 1U, field_width).c_str()));
	}
}
340

341
void csv_parser::_get_fields_with_optional_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length)
342
{
343
	char * field = NULL;
344

345
	/*
346
	 * How to extract the fields, when the enclosure char is optional.
347
	 *
348
	 * This is very similar to parsing the document without enclosure but with the following conditions.
349
	 *
350
	 * If the beginning char is an enclosure character, adjust the starting position of the string by + 1.
351
	 * If the ending char is an enclosure character, adjust the ending position by -1
352
	 */
353
	if (*line_length > 0)
354
	{
355
		field = (char *) malloc(*line_length);
356

357
		memset(field, 0, *line_length);
358

359
		register unsigned int field_start   = 0U;
360
		register unsigned int field_end     = 0U;
361
		register unsigned int char_pos 		= 0U;
362

363
		while(char_pos < *line_length)
364
		{
365
			char curr_char = line[char_pos];
366

367
			if (curr_char == field_term_char)
368
			{
369
				field_end = char_pos;
370

371
				const char * field_starts_at = line + field_start;
372

373
				/* Field width must exclude field delimiter characters */
374
				unsigned int field_width = field_end - field_start;
375

376
				const char line_first_char = field_starts_at[0];
377
				const char line_final_char = field_starts_at[field_width - 1];
378

379
				/* If the enclosure char is found at either ends of the string */
380
				unsigned int first_adjustment = (line_first_char == enclosed_char) ? 1U : 0U;
381
				unsigned int final_adjustment = (line_final_char == enclosed_char) ? 2U : 0U;
382

383
				/* We do not want to have any negative or zero field widths */
384
				field_width = (field_width > 2U) ? (field_width - final_adjustment) : field_width;
385

386
				/* Copy exactly field_width bytes from field_starts_at to field */
387
				memcpy(field, field_starts_at + first_adjustment, field_width);
388

389
				/* This must be a null-terminated character array */
390
				field[field_width] = 0x00;
391

392
				string field_string_obj = field;
393

394
				row->push_back(field_string_obj);
395

396
				/* This is the starting point of the next field */
397
				field_start = char_pos + 1;
398

399
			} else if (curr_char == line_term_char)
400
			{
401
				field_end = char_pos;
402

403
				const char * field_starts_at = line + field_start;
404

405
				/* Field width must exclude line terminating characters */
406
				unsigned int field_width = field_end - field_start;
407

408
				const char line_first_char = field_starts_at[0];
409
				const char line_final_char = field_starts_at[field_width - 1];
410

411
				/* If the enclosure char is found at either ends of the string */
412
				unsigned int first_adjustment = (line_first_char == enclosed_char) ? 1U : 0U;
413
				unsigned int final_adjustment = (line_final_char == enclosed_char) ? 2U : 0U;
414

415
				/* We do not want to have any negative or zero field widths */
416
				field_width = (field_width > 2U) ? (field_width - final_adjustment) : field_width;
417

418
				/* Copy exactly field_width bytes from field_starts_at to field */
419
				memcpy(field, field_starts_at + first_adjustment, field_width);
420

421
				/* This must be a null-terminated character array */
422
				field[field_width] = 0x00;
423

424
				string field_string_obj = field;
425

426
				row->push_back(field_string_obj);
427
			}
428

429
			/* Move to the next character in the current line */
430
			char_pos++;
431
		}
432

433
		/* Deallocate memory for field buffer */
434
		CSV_PARSER_FREE_BUFFER_PTR(field);
435
	}
436
}
437

438
void csv_parser::_read_single_line(char ** buffer, unsigned int * buffer_len)
439
{
440
	long int original_pos = ftell(input_fp);
441
	long int current_pos  = original_pos;
442

443
	register int current_char = 0;
444

445
	/* Checking one character at a time until the end of a line is found */
446
	while(true)
447
	{
448
		current_char = fgetc(input_fp);
449

450
		if (current_char == EOF)
451
		{
452
			/* We have reached the end of the file */
453
			more_rows = false;
454

455
			break;
456

457
		} else if (current_char == line_term_char)
458
		{
459
			/* We have reached the end of the row */
460
			current_pos++;
461

462
			break;
463

464
		} else {
465

466
			current_pos++;
467
		}
468
	}
469

470
	/* Let's try to peek one character ahead to see if we are at the end of the file */
471
	if (more_rows)
472
	{
473
		current_char = fgetc(input_fp);
474

475
		more_rows = (current_char == EOF) ? false : true;
476
	}
477

478
	/* Find out how long this row is */
479
	const size_t length_of_row = current_pos - original_pos;
480

481
	if (length_of_row > 0)
482
	{
483
		*buffer_len = length_of_row * sizeof(char) + 1;
484

485
		*buffer = (char *) realloc(*buffer, *buffer_len);
486

487
		memset(*buffer, 0, *buffer_len);
488

489
		/* Reset the internal pointer to the original position */
490
		fseek(input_fp, original_pos, SEEK_SET);
491

492
		/* Copy the contents of the line into the buffer */
493
		fread(*buffer, 1, length_of_row, input_fp);
494
	}
495
}
496

497
Product

Resources

Company