Friday, April 26, 2013

String Extensions: Split Qualified

Splitting a string on a separator is easy.
Respecting qualified (quoted) strings can be hard.
Identifying escaped characters in qualified strings is very tricky.
Splitting a qualified string while taking escape characters into account is really difficult!

Unit Tests

[Theory]
[InlineData(null,                   new string[0])]
[InlineData("",                     new string[0])]
[InlineData("hello world",          new[] { "hello", "world" })]
[InlineData("hello   world",        new[] { "hello", "world" })]
[InlineData("\"hello world\"",      new[] { "\"hello world\"" })]
[InlineData("\"hello  world\"",     new[] { "\"hello  world\"" })]
[InlineData("hello \"goodnight moon\" world", new[]
{
    "hello", 
    "\"goodnight moon\"", 
    "world", 
})]
[InlineData("hello \"goodnight \\\" moon\" world", new[]
{
    "hello", 
    "\"goodnight \\\" moon\"", 
    "world", 
})]
[InlineData("hello \"goodnight \\\\\" moon\" world", new[]
{
    "hello", 
    "\"goodnight \\\\\"", 
    "moon\"", 
    "world", 
})]
public void SplitQualified(string input, IList<string> expected)
{
    var actual = input
        .SplitQualified(' ', '"')
        .ToList();
 
    Assert.Equal(expected.Count, actual.Count);
 
    for (var i = 0; i < actual.Count; i++)
        Assert.Equal(expected[i], actual[i]);
}

String Extension Methods

public static IEnumerable<string> SplitQualified(
    this string input, 
    char separator, 
    char qualifier, 
    StringSplitOptions options = StringSplitOptions.RemoveEmptyEntries, 
    char escape = '\\')
{
    if (String.IsNullOrWhiteSpace(input))
        return new string[0];
 
    var results = SplitQualified(input, separator, qualifier, escape);
 
    return options == StringSplitOptions.None
        ? results
        : results.Where(r => !String.IsNullOrWhiteSpace(r));
}
 
private static IEnumerable<string> SplitQualified(
    string input, 
    char separator, 
    char qualifier, 
    char escape)
{
    var separatorIndexes = input
        .IndexesOf(separator)
        .ToList();
 
    var qualifierIndexes = input
        .IndexesOf(qualifier)
        .ToList();
 
    // Remove Escaped Qualifiers
    for (var i = 0; i < qualifierIndexes.Count; i++)
    {
        var qualifierIndex = qualifierIndexes[i];
        if (qualifierIndex == 0)
            continue;
 
        if (input[qualifierIndex - 1] != escape)
            continue;
 
        // Watch out for a series of escaped escape characters.
        var escapeResult = false;
        for (var j = 2; qualifierIndex - j > 0; j++)
        {
            if (input[qualifierIndex - j] == escape)
                continue;
 
            escapeResult = j % 2 == 1;
            break;
        }
 
        if (qualifierIndex > 1 && escapeResult)
            continue;
 
        qualifierIndexes.RemoveAt(i);
        i--;
    }
 
    // Remove Qualified Separators
    if (qualifierIndexes.Count > 1)
        for (var i = 0; i < separatorIndexes.Count; i++)
        {
            var separatorIndex = separatorIndexes[i];
 
            for (var j = 0; j < qualifierIndexes.Count - 1; j += 2)
            {
                if (separatorIndex <= qualifierIndexes[j])
                    continue;
 
                if (separatorIndex >= qualifierIndexes[j + 1])
                    continue;
 
                separatorIndexes.RemoveAt(i);
                i--;
            }
        }
 
    // Split String On Separators
    var previousSeparatorIndex = 0;
    foreach (var separatorIndex in separatorIndexes)
    {
        var startIndex = previousSeparatorIndex == 0
            ? previousSeparatorIndex
            : previousSeparatorIndex + 1;
 
        var endIndex = separatorIndex == input.Length - 1
            || previousSeparatorIndex == 0
            ? separatorIndex - previousSeparatorIndex
            : separatorIndex - previousSeparatorIndex - 1;
 
        yield return input.Substring(startIndex, endIndex);
 
        previousSeparatorIndex = separatorIndex;
    }
 
    if (previousSeparatorIndex == 0)
        yield return input;
    else
        yield return input.Substring(previousSeparatorIndex + 1);
}
 
public static IEnumerable<int> IndexesOf(
    this string input, 
    char value)
{
    if (!String.IsNullOrWhiteSpace(input))
    {
        var index = -1;
        do
        {
            index++;
            index = input.IndexOf(value, index);
 
            if (index > -1)
                yield return index;
            else
                break;
        }
        while (index < input.Length);
    }
}
Shout it

Enjoy,
Tom

1 comment:

  1. Nice.

    I especially enjoy the reference to Goodnight Moon.

    ReplyDelete

Real Time Web Analytics