CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
rapid7

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.

GitHub Repository: rapid7/metasploit-framework
Path: blob/master/external/source/byakugan/csv_parser.hpp
Views: 11766
1
/**
 * csv_parser Header File
 *
 * This object is used to parse text documents that are delimited by some
 * type of character. Some of the common delimiters are spaces, tabs, commas and semi-colons.
 *
 * This is a list of common characters encountered by this program
 *
 * This list was prepared from the data from http://www.asciitable.com
 *
 * @li DEC is how it would be represented in decimal form (base 10)
 * @li HEX is how it would be represented in hexadecimal format (base 16)
 *
 * @li DEC HEX Character Name
 * @li 0 0x00 null
 * @li 9 0x09 horizontal tab
 * @li 10 0x0A line feed, new line
 * @li 13 0x0D carriage return
 * @li 27 0x1B escape
 * @li 32 0x20 space
 * @li 34 0x22 double quote
 * @li 39 0x27 single quote
 * @li 44 0x2C comma
 * @li 92 0x5C backslash
 *
 * @author Israel Ekpo <[email protected]>
 */
28
29
#ifndef CSV_PARSER_HPP_INCLUDED
30
31
#define CSV_PARSER_HPP_INCLUDED
32
33
/**
 * Library version information.
 *
 * @li LIBCSV_PARSER_MAJOR_VERSION, LIBCSV_PARSER_MINOR_VERSION and
 *     LIBCSV_PARSER_PATCH_VERSION are the individual version components.
 * @li LIBCSV_PARSER_VERSION_NUMBER is a single integer for the whole version.
 *
 * NOTE(review): 10000 looks like MAJOR * 10000 + MINOR * 100 + PATCH, but the
 * encoding is not stated anywhere in this header - confirm before bumping.
 */
#define LIBCSV_PARSER_MAJOR_VERSION 1

#define LIBCSV_PARSER_MINOR_VERSION 0

#define LIBCSV_PARSER_PATCH_VERSION 0

#define LIBCSV_PARSER_VERSION_NUMBER 10000
40
41
/* C++ header files */
42
#include <string>
43
#include <vector>
44
45
46
/* C header files */
47
#include <cstdio>
48
#include <cstring>
49
#include <cstdlib>
50
51
using namespace std;
52
53
/**
 * @typedef csv_row
 *
 * Data structure used to represent a record.
 *
 * Each element of the vector holds one field of the record.
 *
 * This is an alias for std::vector<std::string>. The names are fully
 * qualified so the alias does not depend on a using-directive being in scope.
 */
typedef std::vector<std::string> csv_row;
61
62
/**
63
* @typedef csv_row_ptr
64
*
65
* Pointer to a csv_row object
66
*
67
* Expands to vector <string> *
68
*/
69
typedef csv_row * csv_row_ptr;
70
71
/**
 * @enum enclosure_type_t
 *
 * This enum type is used to set the mode in which the CSV file is parsed.
 *
 * @li ENCLOSURE_NONE (1) means the CSV file does not use any enclosure characters for the fields
 * @li ENCLOSURE_REQUIRED (2) means the CSV file requires enclosure characters for all the fields
 * @li ENCLOSURE_OPTIONAL (3) means the use of enclosure characters for the fields is optional
 *
 * The ENCLOSURE_TYPE_BEGIN (0) and ENCLOSURE_TYPE_END (4) members only bracket
 * the valid range and are never to be used as a parsing mode.
 */
enum enclosure_type_t
{
    ENCLOSURE_TYPE_BEGIN = 0,
    ENCLOSURE_NONE       = 1,
    ENCLOSURE_REQUIRED   = 2,
    ENCLOSURE_OPTIONAL   = 3,
    ENCLOSURE_TYPE_END
};
91
92
/**
 * @def CSV_PARSER_FREE_BUFFER_PTR(ptr)
 *
 * Used to deallocate buffer pointers
 *
 * It calls free() on the pointer only if it is not null and then resets the
 * pointer to NULL.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement and can be used safely in an unbraced if / else. The invocation
 * must therefore be followed by a semicolon.
 *
 * @param ptr A modifiable pointer expression without side effects; it is
 *            evaluated more than once.
 */
#define CSV_PARSER_FREE_BUFFER_PTR(ptr) \
do \
{ \
    if ((ptr) != NULL) \
    { \
        free(ptr); \
        \
        (ptr) = NULL; \
    } \
} while (0)
106
107
/**
 * @def CSV_PARSER_FREE_FILE_PTR(fptr)
 *
 * Used to close open file handles
 *
 * It calls fclose() on the file pointer only if it is not null and then
 * resets the pointer to NULL.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement and can be used safely in an unbraced if / else. The invocation
 * must therefore be followed by a semicolon.
 *
 * @param fptr A modifiable FILE pointer expression without side effects; it
 *             is evaluated more than once.
 */
#define CSV_PARSER_FREE_FILE_PTR(fptr) \
do \
{ \
    if ((fptr) != NULL) \
    { \
        fclose(fptr); \
        \
        (fptr) = NULL; \
    } \
} while (0)
121
122
/**
123
* @class csv_parser
124
*
125
* The csv_parser object
126
*
127
* Used to parse text files to extract records and fields.
128
*
129
* We are making the following assumptions :
130
*
131
* @li The record terminator is only one character in length.
132
* @li The field terminator is only one character in length.
133
* @li The fields are enclosed by single characters, if any.
134
*
135
* @li The parser can handle documents where fields are always enclosed, not enclosed at all or optionally enclosed.
136
* @li When fields are strictly all enclosed, there is an assumption that any enclosure characters within the field are escaped by placing a backslash in front of the enclosure character.
137
*
138
* The CSV files can be parsed in 3 modes.
139
* @li (a) No enclosures
140
* @li (b) Fields always enclosed.
141
* @li (c) Fields optionally enclosed.
142
*
143
* For option (c) when the enclosure character is optional, if an enclosure character is spotted at either the beginning
144
* or the end of the string, it is assumed that the field is enclosed.
145
*
146
* The csv_parser::init() method can accept a character array as the path to the CSV file.
147
* Since it is overloaded, it can also accept a FILE pointer to a stream that is already open for reading.
148
*
149
* The set_enclosed_char() method accepts the field enclosure character as the first parameter and the enclosure mode as the second parameter which
150
* controls how the text file is going to be parsed.
151
*
152
* @see csv_parser::set_enclosed_char()
153
* @see enclosure_type_t
154
*
155
* @todo Add ability to parse files where fields/columns are terminated by strings instead of just one char.
156
* @todo Add ability to set strings where lines start by. Currently lines do not have any starting char or string.
157
* @todo Add ability to set strings where line end by. Currently lines can only end with a single char.
158
* @todo Add ability to accept other escape characters besides the backslash character 0x5C.
159
* @todo More support for improperly formatted CSV data files.
160
*
161
* @author Israel Ekpo <[email protected]>
162
*/
163
class csv_parser
164
{
165
166
public :
167
168
/**
169
* Class constructor
170
*
171
* This is the default constructor.
172
*
173
* All the internal attributes are initialized here
174
*
175
* @li The enclosure character is initialized to NULL 0x00.
176
* @li The escape character is initialized to the backslash character 0x5C.
177
* @li The field delimiter character is initialized to a comma 0x2C.
178
* @li The record delimiter character is initialized to a new line character 0x0A.
179
*
180
* @li The lengths of all the above-mentioned fields are initialized to 0,1,1 and 1 respectively.
181
* @li The number of records to ignore is set to zero.
182
* @li The more_rows internal attribute is set to false.
183
* @li The pointer to the CSV input file is initialized to NULL
184
* @li The pointer to the buffer for the file name is also initialized to NULL
185
*/
186
csv_parser() : enclosed_char(0x00), escaped_char(0x5C),
187
field_term_char(0x2C), line_term_char(0x0A),
188
enclosed_length(0U), escaped_length(1U),
189
field_term_length(1U), line_term_length(1U),
190
ignore_num_lines(0U), record_count(0U),
191
input_fp(NULL), input_filename(NULL),
192
enclosure_type(ENCLOSURE_NONE),
193
more_rows(false)
194
{ }
195
196
/**
197
* Class destructor
198
*
199
* In the class destructor the file pointer to the input CSV file is closed and
200
* the buffer to the input file name is also deallocated.
201
*
202
* @see csv_parser::input_fp
203
* @see csv_parser::input_filename
204
*/
205
~csv_parser()
206
{
207
CSV_PARSER_FREE_FILE_PTR(input_fp);
208
209
CSV_PARSER_FREE_BUFFER_PTR(input_filename);
210
}
211
212
/**
213
* Initializes the current object
214
*
215
* This init method accepts a pointer to the CSV file that has been opened for reading
216
*
217
* It also resets the file pointer to the beginning of the stream
218
*
219
* @overload bool init(FILE * input_file_pointer)
220
* @param[in] input_file_pointer
221
* @return bool Returns true on success and false on error.
222
*/
223
bool init(FILE * input_file_pointer);
224
225
/**
226
* Initializes the current object
227
*
228
* @li This init method accepts a character array as the path to the csv file.
229
* @li It sets the value of the csv_parser::input_filename property.
230
* @li Then it creates a pointer to the csv_parser::input_fp property.
231
*
232
* @overload bool init(const char * input_filename)
233
* @param[in] input_filename
234
* @return bool Returns true on success and false on error.
235
*/
236
bool init(const char * input_filename);
237
238
/**
239
* Defines the Field Enclosure character used in the Text File
240
*
241
* Setting this to NULL means that the enclosure character is optional.
242
*
243
* If the enclosure is optional, there could be fields that are enclosed, and fields that are not enclosed within the same line/record.
244
*
245
* @param[in] fields_enclosed_by The character used to enclose the fields.
246
* @param[in] enclosure_mode How the CSV file should be parsed.
247
* @return void
248
*/
249
void set_enclosed_char(char fields_enclosed_by, enclosure_type_t enclosure_mode);
250
251
/**
252
* Defines the Field Delimiter character used in the text file
253
*
254
* @param[in] fields_terminated_by
255
* @return void
256
*/
257
void set_field_term_char(char fields_terminated_by);
258
259
/**
260
* Defines the Record Terminator character used in the text file
261
*
262
* @param[in] lines_terminated_by
263
* @return void
264
*/
265
void set_line_term_char(char lines_terminated_by);
266
267
/**
268
* Returns whether there is still more data
269
*
270
* This method returns a boolean value indicating whether or not there are
271
* still more records to be extracted in the current file being parsed.
272
*
273
* Call this method to see if there are more rows to retrieve before invoking csv_parser::get_row()
274
*
275
* @see csv_parser::get_row()
276
* @see csv_parser::more_rows
277
*
278
* @return bool Returns true if there are still more rows and false if there is not.
279
*/
280
bool has_more_rows(void)
281
{
282
return more_rows;
283
}
284
285
/**
286
* Defines the number of records to discard
287
*
288
* The number of records specified will be discarded during the parsing process.
289
*
290
* @see csv_parser::_skip_lines()
291
* @see csv_parser::get_row()
292
* @see csv_parser::has_more_rows()
293
*
294
* @param[in] lines_to_skip How many records should be skipped
295
* @return void
296
*/
297
void set_skip_lines(unsigned int lines_to_skip)
298
{
299
ignore_num_lines = lines_to_skip;
300
}
301
302
/**
303
* Return the current row from the CSV file
304
*
305
* The row is returned as a vector of string objects.
306
*
307
* This method should be called only if csv_parser::has_more_rows() is true
308
*
309
* @see csv_parser::has_more_rows()
310
* @see csv_parser::get_record_count()
311
* @see csv_parser::reset_record_count()
312
* @see csv_parser::more_rows
313
*
314
* @return csv_row A vector type containing an array of strings
315
*/
316
csv_row get_row(void);
317
318
/**
319
* Returns the number of times the csv_parser::get_row() method has been invoked
320
*
321
* @see csv_parser::reset_record_count()
322
* @return unsigned int The number of times the csv_parser::get_row() method has been invoked.
323
*/
324
unsigned int get_record_count(void)
325
{
326
return record_count;
327
}
328
329
/**
330
* Resets the record_count internal attribute to zero
331
*
332
* This may be used if the object is reused multiple times.
333
*
334
* @see csv_parser::record_count
335
* @see csv_parser::get_record_count()
336
* @return void
337
*/
338
void reset_record_count(void)
339
{
340
record_count = 0U;
341
}
342
343
private :
344
345
/**
346
* Ignores N records in the CSV file
347
*
348
* Where N is the value of the csv_parser::ignore_num_lines internal property.
349
*
350
* The number of lines skipped can be defined by csv_parser::set_skip_lines()
351
*
352
* @see csv_parser::set_skip_lines()
353
*
354
* @return void
355
*/
356
void _skip_lines(void);
357
358
/**
359
* Reads a Single Line
360
*
361
* Reads a single record into the buffer passed by reference to the method
362
*
363
* @param[in,out] buffer A pointer to a character array for the current line.
364
* @param[out] buffer_len A pointer to an integer storing the length of the buffer.
365
* @return void
366
*/
367
void _read_single_line(char ** buffer, unsigned int * buffer_len);
368
369
/**
370
* Extracts the fields without enclosures
371
*
372
* This is used when the enclosure character is not set
373
* @param[out] row The vector of strings
374
* @param[in] line The character array buffer containing the current record/line
375
* @param[in] line_length The length of the buffer
376
*/
377
void _get_fields_without_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);
378
379
/**
380
* Extracts the fields with enclosures
381
*
382
* This is used when the enclosure character is set.
383
*
384
* @param[out] row The vector of strings
385
* @param[in] line The character array buffer containing the current record/line
386
* @param[in] line_length The length of the buffer
387
*/
388
void _get_fields_with_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);
389
390
/**
391
* Extracts the fields when enclosure is optional
392
*
393
* This is used when the enclosure character is optional
394
*
395
* Hence, there could be fields that use it, and fields that don't.
396
*
397
* @param[out] row The vector of strings
398
* @param[in] line The character array buffer containing the current record/line
399
* @param[in] line_length The length of the buffer
400
*/
401
void _get_fields_with_optional_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);
402
403
protected :
404
405
/**
406
* The enclosure character
407
*
408
* If present or used for a field it is assumed that both ends of the fields are wrapped.
409
*
410
* This is that single character used in the document to wrap the fields.
411
*
412
* @see csv_parser::_get_fields_without_enclosure()
413
* @see csv_parser::_get_fields_with_enclosure()
414
* @see csv_parser::_get_fields_with_optional_enclosure()
415
*
416
* @var enclosed_char
417
*/
418
char enclosed_char;
419
420
/**
421
* The escape character
422
*
423
* For now the only valid escape character allowed is the backslash character 0x5C
424
*
425
* This is only important when the enclosure character is required or optional.
426
*
427
* This is the backslash character used to escape enclosure characters found within the fields.
428
*
429
* @see csv_parser::_get_fields_with_enclosure()
430
* @see csv_parser::_get_fields_with_optional_enclosure()
431
* @todo Update the code to accept other escape characters besides the backslash
432
*
433
* @var escaped_char
434
*/
435
char escaped_char;
436
437
/**
438
* The field terminator
439
*
440
* This is the single character used to mark the end of a column in the text file.
441
*
442
* Common characters used include the comma, tab, and semi-colons.
443
*
444
* This is the single character used to separate fields within a record.
445
*
446
* @var field_term_char
447
*/
448
char field_term_char;
449
450
/**
451
* The record terminator
452
*
453
* This is the single character used to mark the end of a record in the text file.
454
*
455
* The most popular one is the new line character however it is possible to use others as well.
456
*
457
* This is the single character used to mark the end of a record
458
*
459
* @see csv_parser::get_row()
460
*
461
* @var line_term_char
462
*/
463
char line_term_char;
464
465
/**
466
* Enclosure length
467
*
468
* This is the length of the enclosure character
469
*
470
* @see csv_parser::csv_parser()
471
* @see csv_parser::set_enclosed_char()
472
*
473
* @var enclosed_length
474
*/
475
unsigned int enclosed_length;
476
477
/**
478
* The length of the escape character
479
*
480
* Right now this is really not being used.
481
*
482
* It may be used in future versions of the object.
483
*
484
* @todo Update the code to accept other escape characters besides the backslash
485
*
486
* @var escaped_length
487
*/
488
unsigned int escaped_length;
489
490
/**
491
* Length of the field terminator
492
*
493
* For now this is not being used. It will be used in future versions of the object.
494
*
495
* @var field_term_length
496
*/
497
unsigned int field_term_length;
498
499
/**
500
* Length of the record terminator
501
*
502
* For now this is not being used. It will be used in future versions of the object.
503
*
504
* @var line_term_length
505
*/
506
unsigned int line_term_length;
507
508
/**
509
* Number of records to discard
510
*
511
* This variable controls how many records in the file are skipped before parsing begins.
512
*
513
* @see csv_parser::_skip_lines()
514
* @see csv_parser::set_skip_lines()
515
*
516
* @var ignore_num_lines
517
*/
518
unsigned int ignore_num_lines;
519
520
/**
521
* Number of times the get_row() method has been called
522
*
523
* @see csv_parser::get_row()
524
* @var record_count
525
*/
526
unsigned int record_count;
527
528
/**
529
* The CSV File Pointer
530
*
531
* This is the pointer to the CSV file
532
*
533
* @var input_fp
534
*/
535
FILE * input_fp;
536
537
/**
538
* Buffer to input file name
539
*
540
* This buffer is used to store the name of the file that is being parsed
541
*
542
* @var input_filename
543
*/
544
char * input_filename;
545
546
/**
547
* Mode in which the CSV file will be parsed
548
*
549
* The various values are explained below
550
*
551
* @li ENCLOSURE_NONE (1) means the CSV file does not use any enclosure characters for the fields
552
* @li ENCLOSURE_REQUIRED (2) means the CSV file requires enclosure characters for all the fields
553
* @li ENCLOSURE_OPTIONAL (3) means the use of enclosure characters for the fields is optional
554
*
555
* @see csv_parser::get_row()
556
* @see csv_parser::_read_single_line()
557
* @see csv_parser::_get_fields_without_enclosure()
558
* @see csv_parser::_get_fields_with_enclosure()
559
* @see csv_parser::_get_fields_with_optional_enclosure()
560
*
561
* @var enclosure_type
562
*/
563
enclosure_type_t enclosure_type;
564
565
/**
566
* There are still more records to parse
567
*
568
* This boolean property is an internal indicator of whether there are still records in the
569
* file to be parsed.
570
*
571
* @see csv_parser::has_more_rows()
572
* @var more_rows
573
*/
574
bool more_rows;
575
576
}; /* class csv_parser */
577
578
#endif /* CSV_PARSER_HPP_INCLUDED */
579
580