// Path: blob/trunk/third_party/closure/goog/labs/format/csv.js
// 2868 views
// Copyright 2012 The Closure Library Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS-IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * @fileoverview Provides a parser that turns a string of well-formed CSV data
 * into an array of objects or an array of arrays. All values are returned as
 * strings; the user has to convert data into numbers or Dates as required.
 * Empty fields (adjacent commas) are returned as empty strings.
 *
 * This parser uses http://tools.ietf.org/html/rfc4180 as the definition of CSV.
 *
 * @author [email protected] (Nathan Naze) Ported to Closure
 */
goog.provide('goog.labs.format.csv');
goog.provide('goog.labs.format.csv.ParseError');
goog.provide('goog.labs.format.csv.Token');

goog.require('goog.array');
goog.require('goog.asserts');
goog.require('goog.debug.Error');
goog.require('goog.object');
goog.require('goog.string');
goog.require('goog.string.newlines');


/**
 * @define {boolean} Enable verbose debugging. This is a flag so it can be
 * enabled in production if necessary post-compilation.  Otherwise, debug
 * information will be stripped to minimize final code size.
 */
goog.labs.format.csv.ENABLE_VERBOSE_DEBUGGING = goog.DEBUG;



/**
 * Error thrown when parsing fails.
 *
 * When verbose debugging is enabled, the error message is extended with the
 * line/column of the failure and a rendering of the offending line with a
 * caret under the failing column; the same position is exposed on
 * {@code this.position}. When verbose debugging is disabled no message is
 * built and {@code this.position} stays null.
 *
 * @param {string} text The CSV source text being parsed.
 * @param {number} index The index, in the string, of the position of the
 *      error.
 * @param {string=} opt_message A description of the violated parse expectation.
 * @constructor
 * @extends {goog.debug.Error}
 * @final
 */
goog.labs.format.csv.ParseError = function(text, index, opt_message) {

  var message;

  /**
   * @type {?{line: number, column: number}} The line and column of the parse
   *     error.
   */
  this.position = null;

  if (goog.labs.format.csv.ENABLE_VERBOSE_DEBUGGING) {
    message = opt_message || '';

    var info = goog.labs.format.csv.ParseError.findLineInfo_(text, index);
    if (info) {
      // Line and column are reported 1-indexed.
      var lineNumber = info.lineIndex + 1;
      var columnNumber = index - info.line.startLineIndex + 1;

      this.position = {line: lineNumber, column: columnNumber};

      message +=
          goog.string.subs(' at line %s column %s', lineNumber, columnNumber);
      message += '\n' +
          goog.labs.format.csv.ParseError.getLineDebugString_(
              info.line.getContent(), columnNumber);
    }
  }

  goog.labs.format.csv.ParseError.base(this, 'constructor', message);
};
goog.inherits(goog.labs.format.csv.ParseError, goog.debug.Error);


/** @inheritDoc */
goog.labs.format.csv.ParseError.prototype.name = 'ParseError';


/**
 * Calculate the line and column for an index in a string.
 * TODO(nnaze): Consider moving to goog.string.newlines.
 * @param {string} str A string.
 * @param {number} index An index into the string.
 * @return {?{line: !goog.string.newlines.Line, lineIndex: number}} The line
 *     and index of the line, or null if no line contains the index.
 * @private
 */
goog.labs.format.csv.ParseError.findLineInfo_ = function(str, index) {
  var lines = goog.string.newlines.getLines(str);
  var lineIndex = goog.array.findIndex(lines, function(line) {
    return line.startLineIndex <= index && line.endLineIndex > index;
  });

  // findIndex reports a miss as -1, which is still a number; test the sign
  // rather than the type so that a miss yields null instead of an entry whose
  // line is undefined.
  if (lineIndex >= 0) {
    var line = lines[lineIndex];
    return {line: line, lineIndex: lineIndex};
  }

  return null;
};


/**
 * Get a debug string of a line and a pointing caret beneath it.
 * @param {string} str The string.
 * @param {number} column The column to point at (1-indexed).
 * @return {string} The debug line.
 * @private
 */
goog.labs.format.csv.ParseError.getLineDebugString_ = function(str, column) {
  var returnString = str + '\n';
  returnString += goog.string.repeat(' ', column - 1) + '^';
  return returnString;
};


/**
 * A token -- a single-character string or a sentinel.
 * @typedef {string|!goog.labs.format.csv.Sentinels_}
 */
goog.labs.format.csv.Token;


/**
 * Parses a CSV string to create a two-dimensional array.
 *
 * This function does not process header lines, etc -- such transformations can
 * be made on the resulting array.
 *
 * A delimiter that ends a line or the text produces a trailing empty field,
 * whether the preceding field was quoted or not.
 *
 * @param {string} text The entire CSV text to be parsed.
 * @param {boolean=} opt_ignoreErrors Whether to ignore parsing errors and
 *      instead try to recover and keep going.
 * @param {string=} opt_delimiter The delimiter to use. Defaults to ','
 * @return {!Array<!Array<string>>} The parsed CSV.
 * @throws {goog.labs.format.csv.ParseError} On malformed input, unless
 *      opt_ignoreErrors is set.
 */
goog.labs.format.csv.parse = function(text, opt_ignoreErrors, opt_delimiter) {

  var index = 0;  // current char offset being considered

  var delimiter = opt_delimiter || ',';
  goog.asserts.assert(
      delimiter.length == 1, 'Delimiter must be a single character.');
  goog.asserts.assert(
      delimiter != '\r' && delimiter != '\n',
      'Cannot use newline or carriage return as delimiter.');

  var EOF = goog.labs.format.csv.Sentinels_.EOF;
  var EOR = goog.labs.format.csv.Sentinels_.EOR;
  var NEWLINE = goog.labs.format.csv.Sentinels_.NEWLINE;  // \r?\n
  var EMPTY = goog.labs.format.csv.Sentinels_.EMPTY;

  var pushBackToken = null;  // A single-token pushback.
  var sawComma = false;      // Special case for terminal comma.

  /**
   * Push a single token into the push-back variable.
   * @param {goog.labs.format.csv.Token} t Single token.
   */
  function pushBack(t) {
    goog.labs.format.csv.assertToken_(t);
    // Only one token of look-ahead is supported.
    goog.asserts.assert(goog.isNull(pushBackToken));
    pushBackToken = t;
  }

  /**
   * @return {goog.labs.format.csv.Token} The next token in the stream.
   */
  function nextToken() {
    // Give the push back token if present.
    if (pushBackToken != null) {
      var c = pushBackToken;
      pushBackToken = null;
      return c;
    }

    // We're done. EOF.
    if (index >= text.length) {
      return EOF;
    }

    // Give the next character.
    var chr = text.charAt(index++);
    goog.labs.format.csv.assertToken_(chr);

    // Check if this is a newline.  If so, give the new line sentinel.
    var isNewline = false;
    if (chr == '\n') {
      isNewline = true;
    } else if (chr == '\r') {
      // A '\r\n' pair is treated as a single token, so step over the '\n'
      // as well. A lone '\r' is also reported as a newline.
      if (index < text.length && text.charAt(index) == '\n') {
        index++;
      }

      isNewline = true;
    }

    if (isNewline) {
      return NEWLINE;
    }

    return chr;
  }

  /**
   * Read a quoted field from input.
   * @return {string} The field, as a string.
   */
  function readQuotedField() {
    // We've already consumed the first quote by the time we get here.
    var start = index;
    var end = null;

    for (var token = nextToken(); token != EOF; token = nextToken()) {
      if (token == '"') {
        end = index - 1;
        token = nextToken();

        // Two double quotes in a row.  Keep scanning.
        if (token == '"') {
          end = null;
          continue;
        }

        // End of field.  Break out.
        if (token == delimiter || token == EOF || token == NEWLINE) {
          if (token == NEWLINE) {
            pushBack(token);
          }
          if (token == delimiter) {
            // Record the delimiter so that a following end of line or end of
            // text yields a trailing empty field, exactly as it does after
            // an unquoted field.
            sawComma = true;
          }
          break;
        }

        if (!opt_ignoreErrors) {
          throw new goog.labs.format.csv.ParseError(
              text, index - 1,
              'Unexpected character "' + token + '" after quote mark');
        } else {
          // Ignoring errors: keep the raw text consumed so far (quote marks
          // included) and read the remainder of the field with readField().
          // E.g. "ab"c,d yields the fields "ab"c and d.
          var prefix = '"' + text.substring(start, index);
          var suffix = readField();
          if (suffix == EOR) {
            // readField() consumed the end of the record; push a newline
            // back so the caller still sees the record terminate.
            pushBack(NEWLINE);
            return prefix;
          } else {
            return prefix + suffix;
          }
        }
      }
    }

    if (goog.isNull(end)) {
      if (!opt_ignoreErrors) {
        throw new goog.labs.format.csv.ParseError(
            text, text.length - 1, 'Unexpected end of text after open quote');
      } else {
        // Treat everything up to the end of the text as the field.
        end = text.length;
      }
    }

    // Take substring, combine double quotes.
    return text.substring(start, end).replace(/""/g, '"');
  }

  /**
   * Read a field from input.
   * @return {string|!goog.labs.format.csv.Sentinels_} The field, as a string,
   *     or a sentinel (if applicable).
   */
  function readField() {
    var start = index;
    var didSeeComma = sawComma;
    sawComma = false;
    var token = nextToken();
    if (token == EMPTY) {
      // The trailing empty field was already emitted; the record is over.
      return EOR;
    }
    if (token == EOF || token == NEWLINE) {
      if (didSeeComma) {
        // A delimiter directly before the end of the line or text: emit one
        // empty field now and push EMPTY so the next call ends the record.
        pushBack(EMPTY);
        return '';
      }
      return EOR;
    }

    // This is the beginning of a quoted field.
    if (token == '"') {
      return readQuotedField();
    }

    while (true) {
      // This is the end of line or file.
      if (token == EOF || token == NEWLINE) {
        pushBack(token);
        break;
      }

      // This is the end of the field.
      if (token == delimiter) {
        sawComma = true;
        break;
      }

      if (token == '"' && !opt_ignoreErrors) {
        throw new goog.labs.format.csv.ParseError(
            text, index - 1, 'Unexpected quote mark');
      }

      token = nextToken();
    }


    var returnString = (token == EOF) ?
        text.substring(start) :  // Return to end of file.
        text.substring(start, index - 1);

    return returnString.replace(/[\r\n]+/g, '');  // Squash any CRLFs.
  }

  /**
   * Read the next record.
   * @return {!Array<string>|!goog.labs.format.csv.Sentinels_} A single record
   *     with multiple fields, or the EOF sentinel once the text is exhausted.
   */
  function readRecord() {
    if (index >= text.length) {
      return EOF;
    }
    var record = [];
    for (var field = readField(); field != EOR; field = readField()) {
      record.push(field);
    }
    return record;
  }

  // Read all records and return.
  var records = [];
  for (var record = readRecord(); record != EOF; record = readRecord()) {
    records.push(record);
  }
  return records;
};


/**
 * Sentinel tracking objects.
 * @enum {!Object}
 * @private
 */
goog.labs.format.csv.Sentinels_ = {
  /** Empty field */
  EMPTY: {},

  /** End of file */
  EOF: {},

  /** End of record */
  EOR: {},

  /** Newline. \r?\n */
  NEWLINE: {}
};


/**
 * @param {string} str A string.
 * @return {boolean} Whether the string is a single character.
 * @private
 */
goog.labs.format.csv.isCharacterString_ = function(str) {
  return goog.isString(str) && str.length == 1;
};


/**
 * Assert the parameter is a token.
 * @param {*} o What should be a token.
 * @throws {goog.asserts.AssertionError} If {@code o} is not a token.
 * @private
 */
goog.labs.format.csv.assertToken_ = function(o) {
  if (goog.isString(o)) {
    goog.asserts.assertString(o);
    goog.asserts.assert(
        goog.labs.format.csv.isCharacterString_(o),
        'Should be a string of length 1 or a sentinel.');
  } else {
    goog.asserts.assert(
        goog.object.containsValue(goog.labs.format.csv.Sentinels_, o),
        'Should be a string of length 1 or a sentinel.');
  }
};