DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world.

Snippets has posted 5883 posts at DZone. View Full User Profile

Regex Splitter

11.10.2008
| 9595 views |
  • submit to reddit
        Splits a string using regex delimiters, optionally also keeping the delimiters.

Example:
> java Splitter "\W+" "Hello World!"
Part 1: "Hello"
Part 2: " "
Part 3: "World"
Part 4: "!"
Part 5: ""


Source code:
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.*;

/**
 * Splits a string using regex delimiters, optionally keeping the delimiters.
 *
 * <p>The text before the first match and the text after the last match are
 * always part of the result, even when they are empty. When delimiters are
 * kept, the result alternates between text (even indices) and matched
 * delimiters (odd indices).
 *
 * In response to a <a href="http://stackoverflow.com/questions/275768">Stackoverflow challenge</a>.
 */
public class Splitter {
    /** Default pattern: one or more whitespace characters. */
    private static final Pattern DEFAULT_PATTERN = Pattern.compile("\\s+");

    /** Pattern used in place of a null pattern; matches the empty string. */
    private static final Pattern EMPTY_PATTERN = Pattern.compile("");

    /** Chosen pattern; never null. */
    private final Pattern pattern;

    /** Flag for keeping the delimiters in the result. */
    private final boolean keepDelimiters;

    /**
     * Constructs a new Splitter object from a compiled pattern.
     *
     * @param pattern
     *          Pattern to use. A null pattern is treated as the empty
     *          pattern, exactly like a null string pattern.
     * @param keep_delimiters
     *          Whether matched delimiters are included in the result.
     */
    public Splitter(Pattern pattern, boolean keep_delimiters) {
        // Normalise null here so split() can never hit a null pattern.
        this.pattern = (pattern == null) ? EMPTY_PATTERN : pattern;
        this.keepDelimiters = keep_delimiters;
    }

    /**
     * Constructs a new Splitter object from a regex string.
     *
     * @param pattern
     *          Regex to compile. A null string is treated as the empty
     *          pattern.
     * @param keep_delimiters
     *          Whether matched delimiters are included in the result.
     * @throws PatternSyntaxException
     *          If the regex is not valid.
     */
    public Splitter(String pattern, boolean keep_delimiters) {
        this(pattern == null ? EMPTY_PATTERN : Pattern.compile(pattern), keep_delimiters);
    }

    /**
     * Constructs a Splitter that keeps the delimiters.
     *
     * @param pattern
     *          Pattern to use; null is treated as the empty pattern.
     */
    public Splitter(Pattern pattern) {
        this(pattern, true);
    }

    /**
     * Constructs a Splitter that keeps the delimiters.
     *
     * @param pattern
     *          Regex to compile; null is treated as the empty pattern.
     * @throws PatternSyntaxException
     *          If the regex is not valid.
     */
    public Splitter(String pattern) {
        this(pattern, true);
    }

    /**
     * Constructs a Splitter using the default pattern '\s+' (any whitespace).
     *
     * @param keep_delimiters
     *          Whether matched delimiters are included in the result.
     */
    public Splitter(boolean keep_delimiters) {
        this(DEFAULT_PATTERN, keep_delimiters);
    }

    /**
     * Constructs a Splitter using the default pattern '\s+' (any whitespace)
     * that keeps the delimiters.
     */
    public Splitter() {
        this(DEFAULT_PATTERN);
    }

    /**
     * Splits a string using the pattern.
     *
     * @param text
     *          Text to split. Null is treated as the empty string.
     * @return  Array of strings with each part. For every match the text
     *          preceding it is added, followed by the delimiter itself if
     *          delimiters are kept; the text after the last match is always
     *          the final element. With delimiters kept, text sits at even
     *          indices and delimiters at odd indices.
     */
    public String[] split(String text) {
        String input = (text == null) ? "" : text;

        List<String> parts = new ArrayList<String>();
        Matcher matcher = this.pattern.matcher(input);

        // End offset of the previous match, i.e. where the next text part starts.
        int previousEnd = 0;

        // Iterate through each match
        while (matcher.find()) {
            // Text since last match (may be empty)
            parts.add(input.substring(previousEnd, matcher.start()));

            // The delimiter itself
            if (this.keepDelimiters) {
                parts.add(matcher.group());
            }

            previousEnd = matcher.end();
        }

        // Trailing text (added even when empty)
        parts.add(input.substring(previousEnd));

        return parts.toArray(new String[parts.size()]);
    }

    /**
     * Main method. Takes arguments from the command line and runs .split on
     * them, or runs the built-in test cases when called with "-test".
     *
     * @param argv
     *          Arguments from the command line: either "-test", or a
     *          pattern followed by the text to split.
     */
    public static void main(String[] argv) {
        if (argv.length == 1 && argv[0].equals("-test")) {
            Splitter.run_tests();
            return;
        }

        if (argv.length != 2) {
            System.err.println("Syntax: java Splitter <pattern> <text>");
            System.err.println("        java Splitter -test");
            return;
        }

        // Compile pattern; report a bad regex instead of crashing
        Pattern pattern = null;
        try {
            pattern = Pattern.compile(argv[0]);
        }
        catch (PatternSyntaxException e) {
            System.err.println(e);
            return;
        }

        Splitter splitter = new Splitter(pattern);

        // Iterate through each part
        String text = argv[1];
        int counter = 1;
        for (String part : splitter.split(text)) {
            System.out.printf("Part %d: \"%s\"\n", counter++, part);
        }
    }

    /**
     * Prints the result of splitting a fixed set of sample inputs, keeping
     * the delimiters. Each entry is { text, pattern }; the comments list the
     * parts printed for that entry.
     */
    public static void run_tests() {
        String[][] testCases = {
            // Limit cases (null is treated like the empty string / pattern;
            // the empty pattern matches once, at index 0, in empty text):
            // null text, null pattern: '', '', ''
            { null, null },
            // '' text, null pattern: '', '', ''
            { "", null },
            // null text, '' pattern: '', '', ''
            { null, "" },
            // '' text, '' pattern: '', '', ''
            { "", "" },

            // Border cases:
            // 'abcd' split on 'ab': '', 'ab', 'cd'
            { "abcd", "ab" },
            // 'abcd' split on 'cd': 'ab', 'cd', ''
            { "abcd", "cd" },
            // 'abcd' split on 'abcd': '', 'abcd', ''
            { "abcd", "abcd" },
            // 'abcd' split on 'bc': 'a', 'bc', 'd'
            { "abcd", "bc" },

            // Real cases:
            // 'abcd    efg  hi   j' split on '[ \t\n\r\f]+':
            //   'abcd', '    ', 'efg', '  ', 'hi', '   ', 'j'
            { "abcd    efg  hi   j", "[ \\t\\n\\r\\f]+" },
            // "'ab','cd','eg'" split on '\W+':
            //   '', "'", 'ab', "','", 'cd', "','", 'eg', "'", ''
            { "'ab','cd','eg'", "\\W+" },

            // Split-like cases:
            // 'boo:and:foo' split on ':': 'boo', ':', 'and', ':', 'foo'
            { "boo:and:foo", ":" },
            // 'boo:and:foo' split on 'o':
            //   'b', 'o', '', 'o', ':and:f', 'o', '', 'o', ''
            { "boo:and:foo", "o" },
            // 'boo:and:foo' split on 'o+': 'b', 'oo', ':and:f', 'oo', ''
            { "boo:and:foo", "o+" }
        };

        int testCounter = 1;
        for (String[] testCase : testCases) {
            String text = testCase[0];
            String pattern = testCase[1];

            System.out.printf("Test case #%d:\n", testCounter++);
            System.out.printf("  Text:    '%s'\n", text);
            System.out.printf("  Pattern: /%s/\n", pattern);
            System.out.printf("  Parts:\n");

            Splitter splitter = new Splitter(pattern, true);

            int partCounter = 1;
            for (String part : splitter.split(text)) {
                System.out.printf("    %2d) '%s'\n", partCounter++, part);
            }

            System.out.println();
        }
    }
}

Test cases:
> java Splitter -test
Test case #1:
  Text:    'null'
  Pattern: /null/
  Parts:
     1) ''
     2) ''
     3) ''

Test case #2:
  Text:    ''
  Pattern: /null/
  Parts:
     1) ''
     2) ''
     3) ''

Test case #3:
  Text:    'null'
  Pattern: //
  Parts:
     1) ''
     2) ''
     3) ''

Test case #4:
  Text:    ''
  Pattern: //
  Parts:
     1) ''
     2) ''
     3) ''

Test case #5:
  Text:    'abcd'
  Pattern: /ab/
  Parts:
     1) ''
     2) 'ab'
     3) 'cd'

Test case #6:
  Text:    'abcd'
  Pattern: /cd/
  Parts:
     1) 'ab'
     2) 'cd'
     3) ''

Test case #7:
  Text:    'abcd'
  Pattern: /abcd/
  Parts:
     1) ''
     2) 'abcd'
     3) ''

Test case #8:
  Text:    'abcd'
  Pattern: /bc/
  Parts:
     1) 'a'
     2) 'bc'
     3) 'd'

Test case #9:
  Text:    'abcd    efg  hi   j'
  Pattern: /[ \t\n\r\f]+/
  Parts:
     1) 'abcd'
     2) '    '
     3) 'efg'
     4) '  '
     5) 'hi'
     6) '   '
     7) 'j'

Test case #10:
  Text:    ''ab','cd','eg''
  Pattern: /\W+/
  Parts:
     1) ''
     2) '''
     3) 'ab'
     4) '',''
     5) 'cd'
     6) '',''
     7) 'eg'
     8) '''
     9) ''

Test case #11:
  Text:    'boo:and:foo'
  Pattern: /:/
  Parts:
     1) 'boo'
     2) ':'
     3) 'and'
     4) ':'
     5) 'foo'

Test case #12:
  Text:    'boo:and:foo'
  Pattern: /o/
  Parts:
     1) 'b'
     2) 'o'
     3) ''
     4) 'o'
     5) ':and:f'
     6) 'o'
     7) ''
     8) 'o'
     9) ''

Test case #13:
  Text:    'boo:and:foo'
  Pattern: /o+/
  Parts:
     1) 'b'
     2) 'oo'
     3) ':and:f'
     4) 'oo'
     5) ''
    

Comments

Snippets Manager replied on Mon, 2009/01/26 - 7:42am

Thank you very, very much for this.