At long last I've finished the CSV parser I've been working on. Most of the technical problems have been string matching problems, in particular learning that a string is a list of integers, each of which can be matched individually by prefixing the char to be matched with a dollar sign.
The parser uses a state machine implemented by using a process and messages the next char until it reaches the end of the file/string at which point it messages an eof atom and awaits the process to message back the parsed CSV. In the end the parser used quite a lot of erlang's features including processes, funs and parametrised macros and the end result was pretty clean. It can take a plain string or an IO device such as a file as the string source which is handled in a nice way using funs to get the next char. I found the switch from OOP to functional confusing at first since I wanted to use an input stream but the functional method I discovered is probably smaller than the Java stream based approach I would have used otherwise.
Other notable CSV parsers include ppolv's and an FSM OTP behaviour from Praveen Ray of Yellowfish. I'm really impressed by the OTP behaviour, I can imagine this would improve reuse once comfortable with erlang and the OTP.
The parser uses a state machine implemented by using a process and messages the next char until it reaches the end of the file/string at which point it messages an eof atom and awaits the process to message back the parsed CSV. In the end the parser used quite a lot of erlang's features including processes, funs and parametrised macros and the end result was pretty clean. It can take a plain string or an IO device such as a file as the string source which is handled in a nice way using funs to get the next char. I found the switch from OOP to functional confusing at first since I wanted to use an input stream but the functional method I discovered is probably smaller than the Java stream based approach I would have used otherwise.
Other notable CSV parsers include ppolv's and an FSM OTP behaviour from Praveen Ray of Yellowfish. I'm really impressed by the OTP behaviour, I can imagine this would improve reuse once comfortable with erlang and the OTP.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%% @author atill
%%
%% @doc
%% CSV parsing module. Parsed CSV is converted into a flat list containing
%% the separated values as strings; lines are separated by the atom
%% `newline'.
-module(csv).
%% reverse/1 is imported because parts of this module call it unqualified.
-import(lists, [reverse/1]).
-export([parse_from_io/1, parse_from_string/1, iterate_chars/4]).
%% Exported only so that it can be started with spawn/3.
-export([parser_process/0]).
%% @doc Parses CSV using an IO device, such as an opened file, as the
%% character source.
%%
%% A parser process running parser_process/0 is spawned, every character read
%% from `IoDevice' is sent to it, and the list of parsed values it replies
%% with at end of input is returned.
parse_from_io(IoDevice) ->
    %% Iterator contract used by iterate_chars/3: take the source, return
    %% {NextChar | eof, UpdatedSource}. The device keeps its own read
    %% position, so it is handed back unchanged.
    IoDeviceIterator = fun(Io) ->
        {io:get_chars(Io, "", 1), Io}
    end,
    iterate_chars(spawn(?MODULE, parser_process, []), IoDeviceIterator, IoDevice).
%% @doc Parses CSV held in a string (a list of character codes).
%%
%% A parser process running parser_process/0 is spawned, every character of
%% `String' is sent to it, and the list of parsed values it replies with at
%% end of input is returned.
parse_from_string(String) ->
    %% get_first_char/1 already has the iterator shape expected by
    %% iterate_chars/3: {NextChar | eof, RemainingChars}.
    iterate_chars(spawn(?MODULE, parser_process, []), fun get_first_char/1, String).
%% @doc Entry point of the spawned parser process; used internally, do NOT
%% call it directly. Enters the `ready' state of the CSV state machine with
%% nothing parsed yet.
parser_process() ->
    ready().
%%
%% Local Functions
%%
%% Fetches the next character from TextSource with IteratorFun and hands it,
%% together with the updated source, to iterate_chars/4.
%% IteratorFun must return {Char | eof, UpdatedTextSource}.
iterate_chars(ParserPid, IteratorFun, TextSource) ->
    {NextChar, RemainingSource} = IteratorFun(TextSource),
    iterate_chars(ParserPid, IteratorFun, RemainingSource, NextChar).
%% Sends the current character to the parser process and moves on to the
%% next one. When the source is exhausted (eof) the parser is asked for its
%% result with {eof, self()} and the reply, a one-element tuple holding the
%% parsed list, is unwrapped and returned.
%% Note: the receive has no timeout, so this blocks until the parser answers.
iterate_chars(Pid, _IteratorFun, _TextSource, eof) ->
    Pid ! {eof, self()},
    receive
        {ParsedCsv} ->
            ParsedCsv
    end;
iterate_chars(Pid, IteratorFun, TextSource, Char) ->
    Pid ! {clean_char_argument(Char)},
    %% iterate_chars/3 fetches the next character and calls back into this
    %% function.
    iterate_chars(Pid, IteratorFun, TextSource).
%% @doc Normalises a character argument to a single value: an integer is
%% returned as is, while for a non-empty list (e.g. a one-character string
%% read from an IO device) the first element is returned.
clean_char_argument([CharInt | _]) ->
    CharInt;
clean_char_argument(CharInt) when is_integer(CharInt) ->
    CharInt.
%% @doc Splits the first character off a string. Returns
%% {FirstChar, RemainingChars}, or {eof, []} when no characters remain.
get_first_char([]) ->
    {eof, []};
get_first_char([FirstChar | Tail]) ->
    {FirstChar, Tail}.
%%
%% CSV State Machine
%%
%% Value accumulator with no characters collected yet.
-define(EMPTY_STRING, []).
%% Message asking the state machine for its result; binds ResultPid.
-define(CSV_EOF_PATTERN, {eof, ResultPid}).
%% Finishes parsing: appends the value being built and sends the parsed list,
%% in input order, to ResultPid wrapped in a one-element tuple.
%% Expects CurrentValue, ParsedCsv and ResultPid to be bound at the point of
%% use; both accumulators are built in reverse, hence the two reversals.
-define(CSV_EOF,
    CsvLine = lists:reverse([lists:reverse(CurrentValue) | ParsedCsv]),
    ResultPid ! {CsvLine}
).
% The ready state awaits chars to be passed to it and builds a string from
% the chars received between value delimiters.
% If a quote is encountered the machine moves to the in_quotes state.
%
% ParsedCsv holds the finished values (and newline atoms) newest first,
% CurrentValue holds the chars of the value being built, newest first.
ready() ->
    ready([], []).
ready(ParsedCsv, CurrentValue) ->
    receive
        {Char} when Char =:= $" orelse Char =:= $' ->
            % pass an empty string to in_quotes as we do not want the
            % preceding characters to be included, only those in quotes
            in_quotes(ParsedCsv, ?EMPTY_STRING, Char);
        {Char} when Char =:= $, ->
            % value delimiter: store the finished value, start a new one
            ready([lists:reverse(CurrentValue) | ParsedCsv], ?EMPTY_STRING);
        {Char} when Char =:= $\n ->
            % store the finished value followed by a newline atom
            ready([newline, lists:reverse(CurrentValue) | ParsedCsv], ?EMPTY_STRING);
        {Char} when Char =:= $\r ->
            % ignore carriage return characters
            ready(ParsedCsv, CurrentValue);
        {Char} ->
            ready(ParsedCsv, [Char | CurrentValue]);
        ?CSV_EOF_PATTERN ->
            ?CSV_EOF
    end.
% The in_quotes state adds all chars it receives to the value string until
% it receives a char matching the initial quote, in which case it stores the
% value and moves to the skip_to_delimiter state.
% Delimiters and newlines inside the quotes are kept as part of the value.
% If the input ends before the closing quote, the chars collected so far are
% returned as the last value.
in_quotes(ParsedCsv, CurrentValue, QuoteChar) ->
    receive
        {QuoteChar} ->
            % QuoteChar is already bound, so this only matches the closing quote
            skip_to_delimiter([lists:reverse(CurrentValue) | ParsedCsv]);
        {Char} ->
            in_quotes(ParsedCsv, [Char | CurrentValue], QuoteChar);
        ?CSV_EOF_PATTERN ->
            ?CSV_EOF
    end.
% The skip_to_delimiter state is entered after a closing quote. Chars
% received here are thrown away; when a value delimiter or a newline is
% received the machine moves to the ready state again.
skip_to_delimiter(ParsedCsv) ->
    receive
        {$,} ->
            ready(ParsedCsv, ?EMPTY_STRING);
        {$\n} ->
            % A newline ends the quoted value as well. Record the newline atom
            % here; otherwise the line break and every char up to the next
            % comma (the first value of the following line) would be dropped.
            ready([newline | ParsedCsv], ?EMPTY_STRING);
        {eof, ResultPid} ->
            % we are not building a value with the chars we receive so just
            % pass the already parsed list
            ResultPid ! {lists:reverse(ParsedCsv)};
        {_} ->
            skip_to_delimiter(ParsedCsv)
    end.