WONTFIX: 2009

Saturday, 26 December 2009

Erlang CSV Parser

At long last I've finished the CSV parser I've been working on. Most of the technical problems have been string matching problems, stemming from the fact that a string is a list of integers, each of which can be matched individually by prefixing a dollar sign to the char that should be matched.

The parser uses a state machine implemented by using a process and messages the next char until it reaches the end of the file/string at which point it messages an eof atom and awaits the process to message back the parsed CSV. In the end the parser used quite a lot of erlang's features including processes, funs and parametrised macros and the end result was pretty clean. It can take a plain string or an IO device such as a file as the string source which is handled in a nice way using funs to get the next char. I found the switch from OOP to functional confusing at first since I wanted to use an input stream but the functional method I discovered is probably smaller than the Java stream based approach I would have used otherwise.

Other notable CSV parsers include ppolv's and an FSM OTP behaviour from Praveen Ray of Yellowfish. I'm really impressed by the OTP behaviour, I can imagine this would improve reuse once comfortable with erlang and the OTP.

	%% @author atill
	%%
	%% @doc
	%% CSV parsing module. Parsed CSV will be processed and converted
	%% into a list containing the separated values, lines are separated
	%% by the newline atom.

	-module(csv).

	-import(lists, [reverse/1]).

	-export([parse_from_io/1, parse_from_string/1, iterate_chars/4]).

	-export([parser_process/0]).

	%% @doc Parse CSV read from an io device (such as an open file).
	%% Characters are pulled one at a time with io:get_chars/3 and sent to a
	%% freshly spawned parser process; the value returned is whatever the
	%% parser process sends back once end of input is reached.
	parse_from_io(IoDevice) ->
	    % io:get_chars/3 yields a one-char string, or the atom eof at end of
	    % input; the device itself is threaded through as the "remaining source"
	    IoDeviceIterator = fun(Io) ->
	        {io:get_chars(Io, "", 1), Io}
	    end,
	    % linked so the parser process does not linger if this process dies
	    % part way through feeding it characters
	    ParserPid = spawn_link(?MODULE, parser_process, []),
	    iterate_chars(ParserPid, IoDeviceIterator, IoDevice).

	%% @doc Parse CSV held in a string (a list of character integers).
	%% The string is consumed head-first via get_first_char/1 and each char is
	%% sent to a freshly spawned parser process; the value returned is whatever
	%% the parser process sends back once the string is exhausted.
	parse_from_string(String) ->
	    % linked so the parser process does not linger if this process dies
	    % part way through feeding it characters
	    ParserPid = spawn_link(?MODULE, parser_process, []),
	    iterate_chars(ParserPid, fun get_first_char/1, String).

	%% @doc Entry point of the spawned parser process, do NOT call directly!
	%% Starts the state machine in the ready state with nothing parsed yet and
	%% an empty current value.
	parser_process() ->
	    ready([], []).

	%%
	%% Local Functions
	%%

	%% @doc Fetch the first char from the text source and start the feed loop.
	%% IteratorFun takes the text source and returns {Char, RemainingSource},
	%% where Char is the atom eof once the source is exhausted.
	iterate_chars(ParserPid, IteratorFun, TextSource) ->
	    {Char, Remaining} = IteratorFun(TextSource),
	    iterate_chars(ParserPid, IteratorFun, Remaining, Char).

	%% @doc Feed loop: send the current char to the parser process and move on
	%% to the next one. When the current char is the atom eof, tell the parser
	%% {eof, self()} and block until it replies with a one-element tuple
	%% holding the parsed CSV, which is returned.
	iterate_chars(Pid, _IteratorFun, _TextSource, eof) ->
	    Pid ! {eof, self()},
	    receive
	        {Result} -> Result
	    end;
	iterate_chars(Pid, IteratorFun, TextSource, Char) ->
	    Pid ! {clean_char_argument(Char)},
	    % iterate_chars/3 pulls the next char and re-enters this function
	    iterate_chars(Pid, IteratorFun, TextSource).

	%% @doc Make sure that an integer denoting a char is returned instead of a
	%% string. A non-empty list yields its first element; an integer is
	%% returned unchanged. Anything else fails with function_clause.
	clean_char_argument([CharInt | _]) ->
	    CharInt;
	clean_char_argument(CharInt) when is_integer(CharInt) ->
	    CharInt.

	%% @doc Split a string into its first char and the rest.
	%% Returns {FirstChar, RemainingChars}, or {eof, []} when no chars remain.
	get_first_char([]) ->
	    {eof, []};
	get_first_char([FirstChar | Tail]) ->
	    {FirstChar, Tail}.

	%%
	%% CSV State Machine
	%%

	% Initial (empty) accumulator for the value currently being built.
	-define(EMPTY_STRING, []).

	% Receive pattern for the end-of-input message; binds ResultPid to the
	% process that wants the parsed result.
	-define(CSV_EOF_PATTERN, {eof, ResultPid}).
	% End-of-input action: push the value being built onto the parsed list,
	% restore the original order and send the result wrapped in a one-element
	% tuple. Expands in place, so ParsedCsv, CurrentValue and ResultPid must
	% already be bound where the macro is used.
	-define(CSV_EOF,
	    CsvLine = reverse([reverse(CurrentValue) | ParsedCsv]),
	    ResultPid ! {CsvLine}
	).

	% The ready state awaits chars, one per {Char} message, and builds a
	% string from the chars received between value delimiters.
	% ParsedCsv and CurrentValue are both accumulated in reverse order and are
	% only reversed when a value or the whole result is completed.
	% If a quote is encountered the machine moves to the in_quotes state.
	ready() ->
	    ready([], []).

	ready(ParsedCsv, CurrentValue) ->
	    receive
	        {Char} when (Char == $") or (Char == $') ->
	            % pass an empty string to in_quotes as we do not want the
	            % preceding characters to be included, only those in quotes
	            in_quotes(ParsedCsv, ?EMPTY_STRING, Char);
	        {Char} when Char == $, ->
	            ready([reverse(CurrentValue) | ParsedCsv], ?EMPTY_STRING);
	        {Char} when Char == $\n ->
	            % insert a newline atom when a newline char is received
	            List = [newline | [reverse(CurrentValue) | ParsedCsv]],
	            ready(List, ?EMPTY_STRING);
	        {Char} when Char == $\r ->
	            % ignore carriage return characters
	            ready(ParsedCsv, CurrentValue);
	        {Char} ->
	            ready(ParsedCsv, [Char | CurrentValue]);
	        ?CSV_EOF_PATTERN ->
	            ?CSV_EOF
	    end.

	% The in_quotes state adds every char it receives to the value string
	% (including delimiters and newlines) until it receives a char matching
	% the opening quote, at which point the finished value is pushed onto the
	% parsed list and the machine moves to the skip_to_delimiter state.
	% If input ends before the closing quote, the chars collected so far are
	% emitted as the final value.
	in_quotes(ParsedCsv, CurrentValue, QuoteChar) ->
	    receive
	        {Char} when Char == QuoteChar ->
	            skip_to_delimiter([reverse(CurrentValue) | ParsedCsv]);
	        {Char} ->
	            in_quotes(ParsedCsv, [Char | CurrentValue], QuoteChar);
	        ?CSV_EOF_PATTERN ->
	            ?CSV_EOF
	    end.

	% The skip_to_delimiter state is entered after a closing quote. Chars it
	% receives are thrown away until a value delimiter or a newline arrives,
	% at which point the machine moves to the ready state again.
	skip_to_delimiter(ParsedCsv) ->
	    receive
	        {Char} when Char == $, ->
	            ready(ParsedCsv, ?EMPTY_STRING);
	        {Char} when Char == $\n ->
	            % a newline also ends the value; record the newline atom so the
	            % row boundary is not lost when a line ends with a quoted value
	            % (the value itself was already pushed by in_quotes)
	            ready([newline | ParsedCsv], ?EMPTY_STRING);
	        ?CSV_EOF_PATTERN ->
	            % we are not building a value with the chars we receive so just
	            % pass the already parsed list
	            ResultPid ! {reverse(ParsedCsv)};
	        {_} ->
	            skip_to_delimiter(ParsedCsv)
	    end.

view raw csv.erl hosted with ❤ by GitHub

Friday, 25 December 2009

Scribefire

I'm now blogging using scribefire which is superb so far, check it out if you're using your blogs standard editor.

Merry XMAS.

Learning Erlang

I've been learning Erlang on and off for a while and I still can't believe how high the barrier to entry is to create something relatively simple. By this I mean the large number of new skills and tools that I need to be familiar with before I even get coding. I'm not talking about the language itself here, as I understand that learning my first functional language isn't going to be as easy a switch as it would be from C# to Java or vice versa.

I've run into two hurdles before tonight which has slowed me down and one just now which has compelled me to write about it. Number one is a bug in eunit (a bug in a unit testing framework!!!), this really had me stumped for a while. I believed I was working from a copy that I checked out from the eunit SVN repository and dutifully went through the code:add_path motions to update the repo to get the bug fix. However eunit is actually part of the core libraries so I had to apply the given patch to the erlang distribution, in the end downloading the newest Erlang fixed the problem.

All through development I've had a problem automating the compilation of my code, going down a dead end path with erlangantlib then make not being able to interpret the eunit make file when I was tinkering with it as above. This was all crappy and gave me a bad feeling but this is all part of getting to know the language that doesn't have IBM, Sun or Microsoft backing it. More's the pity.

The final bullet was the debugger not being able to start and giving an erl.exe has encountered a problem message. I know why because the excuse is in the Erlang Programming book, the tools in Erlang to my knowledge are all built using the TCL UI toolkit which is really crap on Windows. Cheers, this feels like a real waste of my time. Am I really supposed to start using Linux just to work proficiently in Erlang?

I'm not going to give up on Erlang just yet, its features are still far too compelling to let go of.

Monday, 14 December 2009

Checked Exceptions Part 2

So after the big rant over checked exceptions I've been bitten by the problems that come with them, not for the first time but it seemed all the more acute since I just wrote about them. By modifying the signature of a method to convert ints to enum types I started a change that meant modifying forty files from the depths of the application to its highest spire, and overall creating three different exceptions to encapsulate the added exception and IOExceptions that could already occur. Take into account that some of these files were unit tests that would need to change to throw the exception if it occurred but did not need any more modification to pass.

The problem with specifying a checked exception here would be the same if Integer.valueOf(String) threw a checked version of NumberFormatException, try-catches and throws statements appeared everywhere. This case is pretty much the worst case for this change as the method is used in more places than I expected, this is an API meant for eventual consumption by people who I may never meet. Who may not have the code there to view the documentation. I left work tonight without committing that work into SVN to think whether this change was really worth all the upheaval it had caused.

To clarify, the method already threw an IllegalArgumentException that was duly documented. The reason I created a custom checked exception was because the unthinkable happened, it actually threw the runtime exception and although it was caught it didn't give me any comfort. Luckily the method executing the logic throwing the exception handled and logged it. While adding the tries and throws I realised that it was only caught appropriately in about 20% of the cases that it was used. I think this is a good estimate of how many runtime exceptions are caught appropriately in most software, luckily things mostly go our way ;)

After some serious thought I realised that had the method thrown the exception when it was created the same amount of error handling code would exist and I would be happy because now the software is more robust. The exception will never be thrown out of core application code i.e. libraries such as Swing with unforeseen results.

The lesson I learned from this isn't that checked exceptions are hard, it's that they are hard to change after the code is in use. This is in exactly the same way that changing method parameters is hard once the code is in use. A couple of things to look out for next time:

Next time the method is going to throw runtime exception perhaps a custom checked exception should be thrown. The runtime exception will most likely not be caught, should the software be allowed to crash if this happens?

Exception hierarchy is your friend. By throwing subclasses of MyCompanyException (which is common but pointless on its own) catchers can easily cope with and log this but many custom versions can be created for more fine grained strategies. Also this may already be caught so no modification required in those cases.

One more conclusion that surprised me is that I actually found some existing errors lurking in code, two stream.close() cases that weren't contained in a finally block. If an exception was thrown then they would not be closed. Not the worst problem in the world but it shows how when I was in error handling mode how I saw code differently.

Sunday, 6 December 2009

UncaughtExceptionHandler

One more thing on exceptions tonight. Always remember to add an UncaughtExceptionHandler to any Thread that you create, the default implementation can just log using your framework of choice but you will be glad of it when an exception occurs. You can install the exception handler using the static Thread.setDefaultUncaughtExceptionHandler(UncaughtExceptionHandler) method.

Checked Exceptions

I've always felt that exceptions were never used to their full potential in my code. It is so common in software to do a try-catch and log the exception that perhaps there should be a keyword built into the language for this :)

Every time the system logs or returns null because something couldn't happen then maybe a checked exception should be thrown.

Before using the eclipse refactor “surround with try-catch” we have to think is this really the best and most flexible way to handle the potential problem? For example say you have a method with a signature of void foo(obj) and something goes wrong, the client may be able to handle the problem but at least needs to be aware that something went wrong, indicating that the task didn't complete as expected.

There are a couple of problems with this:
Thrown exceptions become part of the API so don't be too implementation specific.
Custom exceptions are not so easy to create, a new class needs to be created, more code more maintenance.
More responsibility on the API consumer and code duplication, catching exceptions and handling them in the same way many times throughout the code.

The argument that checked exceptions pollute code with implementation specific details is probably the biggest argument against them, and rightly so. For example if a component used a file to store data it might throw an IOException. Later on you might scale that up to a database, now you want to throw an SQLException but you can't change it because backwards compatibility would be broken. Not too bad if you own all of the code but an impossibility if you don't. The simple work around is to create an exception for this situation and put the exception as the cause in the Exception(Throwable) constructor.

It is a shame that types in Java are not easily created, it would be great to do a typedef where the result is a completely new type. It would be especially good if generics could be included in that way like templates can in c++. This idea came about from reading about Erlang atoms, which are values that represent only themselves and can be easily created, so custom exceptions are cheap to create.

I saw some excellent c++ code not so long ago, types with templates were typedef'd into a type with a legible name. This was great although I'm not sure using it globally a lot is great idea, locally within a class would be excellent.

Lastly putting the onus on the consumer of the code to handle the exception correctly could be potentially dangerous. There are a lot of ways to badly handle exceptions, wrap in a RuntimeException and throw being the worst in most situations. This could happen because the code is in a Runnable or other interface which does not allow a checked exception to be thrown but doesn't make me feel any better about it.

Even with all of this, checked exceptions are the only maintainable way of using exceptions in your code as an error handling strategy. Can you imagine looking for which exceptions should be caught even a few calls deep? This is almost impossible and a maintenance nightmare.

There are some good counter arguments to all of these points and more below. Although I do agree that there are problems, especially in the google testing article, the predictability of checked exceptions is awesome.

http://googletesting.blogspot.com/2009/09/checked-exceptions-i-love-you-but-you.html
and...
http://www.artima.com/intv/handcuffs.html

Thursday, 19 November 2009

Java Memory Debug Command Line Settings

Analysing memory usage and checking for memory leaks is an integral part of any Java application development. This is especially true for swing apps, where swing design lends itself to creating leaks (lots of listeners with different lifespans, some of them static, SWING TIMER!).

Here's my typical command line arguments when I'm running in debug mode, enjoy.

set MEMORY_DEBUG_ARGS=-agentlib:hprof=file=myheapdump.hprof,format=b -verbose:gc -XX:+HeapDumpOnOutOfMemoryError -XX:+HeapDumpOnCtrlBreak

set LOW_MEMORY_HEAP=-ms128m -mx128m

Tuesday, 3 November 2009

Builder Pattern

I've gotten round to improving some of the more complicated domain objects today by making them a) immutable b) use builders and c) use default serialisation techniques which has been an overall success.

Making the objects immutable has become quite necessary since objects are serialised throughout our distributed system and the instance you're holding might be stale, and any of the fields changed on it may overwrite the latest data, so really you want to request a change and some time later receive the updated object and typically display it until all that happens again.

The best way of doing this that I've found is the builder pattern from the Effective Java (2nd edition) book, while also trumping some of the other builder patterns I've seen. Basically the domain object has an inner Builder class with mutable data, and a build method which returns the domain object with its data passed to the object's constructor. I'm not going to give a code sample since no one is reading and you should check out the book, go to Waterstones :P

Other problems have been finding the true identity of an object for equals and hashCode, is it really a combination of all of the fields or is it a select few containing the true ID of the object? It's far better to keep this data in its own class or else the clutter will consume the intent of your code.

Serialisation was hit and miss, the end result was turning a class that serialised into 35 bytes into one that took 208. More investigation is needed because there is far more to it than I originally thought. I'm not really impressed by all of the magic methods like readObject, readResolve etc. Shouldn't this have been wrapped up in an interface, if only to provide some code completion?

Wednesday, 21 October 2009

Guice in a Java Swing App

Recently I have been working on a project that as normal uses Swing but decided to use Guice to avoid some of the problems that I’ve encountered in previous projects.

The problem is that if you have components that rely on service type interfaces that are nested several layers deep (not that unlikely in a complex UI) then to pass the service to the component all the other components above it need to have methods/constructors that pass the service down the hierarchy. This culminates in a simple change to a component that needs a reference to a new service requiring changes to 4/5 classes that also now have references to something they didn’t need to know about. This means extra coupling and more pain when you’re trying to refactor.

So now there are panels which have references to components that are themselves injected, which have injected services and everything works. It just works. In fact I barely think about it anymore, components with dependencies are just injected and there are no problems, no extra dependencies and no extra work. If the injected components also need service dependant components then they are injected as well and have no impact up the hierarchy.

Many classes in the project now have no code depending on the way they are created, which is a real joy. I haven’t heard this benefit touted by IoC propaganda but it really should be. If you ever wondered if adding dependency injection was overkill for your project, note down what you would inject. If there are more than two (my own measurement) injectable classes then the gains will be more than the cost of complexity in setting up the IoC infrastructure with modules and injectors etc.

Friday, 22 May 2009

DMA

I've had a frustrating day wrestling with my sound recording setup. The Microtrack 2 that I'm using seems to have toasted one of my compact flash cards, now it crashes Windows Explorer!

The problem was fixed by turning off DMA on the Microtrack recorder and now it's recording properly again. DMA is a mechanism for circumventing the CPU, while writing to memory on a different device.

Thanks to Mr Sandiford for figuring this out :)

Wednesday, 4 March 2009

Referencing Enclosing Instance of an Inner Class

When using an inner class how do you reference the enclosing class? The answer is Object.this where Object is the class type of the enclosing class. Sweet :)

Sunday, 11 January 2009

Setting a custom cursor which doesn't get resized

When setting a custom cursor in Swing you shouldn't really rely on the createCustomCursor method to use your images in any respectful way. The behaviour of this method is to resize the image into the dimensions returned by the getBestCursorSize method, which on Windows XP always seems to return 32x32 pixels.

To me this seems pretty crap because the image is going to get resized at some point depending on the platform your app is being run on, which will most likely make the cursor image look terrible, perhaps to the user, unusable. IMO the behaviour should be to create a new image of the dimensions of getBestCursorSize and draw the supplied image at point 0,0. This does cause a problem if your image is larger than the dimension but you would be screwed the default way anyway.

Implementation is below...

import java.awt.Dimension;
import java.awt.Graphics2D;
import java.awt.Image;
import java.awt.Toolkit;
import java.awt.image.BufferedImage;

public class SizedCursor
{
  public static Image getPreferredSizedCursor(
      Image image)
  {
    Dimension bestDimension = Toolkit
        .getDefaultToolkit()
        .getBestCursorSize(
          image
              .getWidth(null),
          image
              .getHeight(null));
    
    if (bestDimensionsEqualsImageSize(
      image,
      bestDimension))
    {
      return image;
    }
    else
    {
      BufferedImage resizedImage = new BufferedImage(
          bestDimension.width,
          bestDimension.height,
          BufferedImage.TYPE_INT_ARGB);
      Graphics2D g = (Graphics2D) resizedImage
          .getGraphics();
      
      g.drawImage(
        image, 0,
        0, null);
      
      return resizedImage;
    }
  }
  
  private static boolean bestDimensionsEqualsImageSize(
      Image image,
      Dimension bestDimension)
  {
    return bestDimension
        .getWidth() == image
        .getWidth(null)
        && bestDimension
            .getHeight() == image
            .getHeight(null);
  }
}