source: src/main/java/agents/org/apache/commons/lang/text/StrTokenizer.java

Last change on this file was 127, checked in by Wouter Pasman, 6 years ago

#41 ROLL BACK of rev.126 . So this version is equal to rev. 125

File size: 38.5 KB
Line 
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17package agents.org.apache.commons.lang.text;
18
19import java.util.ArrayList;
20import java.util.Collections;
21import java.util.List;
22import java.util.ListIterator;
23import java.util.NoSuchElementException;
24
25/**
26 * Tokenizes a string based on delimiters (separators)
27 * and supporting quoting and ignored character concepts.
28 * <p>
29 * This class can split a String into many smaller strings. It aims
30 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
31 * however it offers much more control and flexibility including implementing
32 * the <code>ListIterator</code> interface. By default, it is set up
33 * like <code>StringTokenizer</code>.
34 * <p>
35 * The input String is split into a number of <i>tokens</i>.
36 * Each token is separated from the next String by a <i>delimiter</i>.
37 * One or more delimiter characters must be specified.
38 * <p>
39 * Each token may be surrounded by quotes.
40 * The <i>quote</i> matcher specifies the quote character(s).
41 * A quote may be escaped within a quoted section by duplicating itself.
42 * <p>
43 * Between each token and the delimiter are potentially characters that need trimming.
44 * The <i>trimmer</i> matcher specifies these characters.
45 * One usage might be to trim whitespace characters.
46 * <p>
47 * At any point outside the quotes there might potentially be invalid characters.
48 * The <i>ignored</i> matcher specifies these characters to be removed.
49 * One usage might be to remove new line characters.
50 * <p>
51 * Empty tokens may be removed or returned as null.
52 * <pre>
53 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
54 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
55 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
56 * </pre>
57 * <p>
58 *
59 * This tokenizer has the following properties and options:
60 *
61 * <table>
62 * <tr>
63 * <th>Property</th><th>Type</th><th>Default</th>
64 * </tr>
65 * <tr>
66 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
67 * </tr>
68 * <tr>
69 * <td>quote</td><td>NoneMatcher</td><td>{}</td>
70 * </tr>
71 * <tr>
72 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
73 * </tr>
74 * <tr>
75 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
76 * </tr>
77 * <tr>
78 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
79 * </tr>
80 * </table>
81 *
82 * @author Apache Software Foundation
83 * @author Matthew Inger
84 * @author Gary D. Gregory
85 * @since 2.2
86 * @version $Id: StrTokenizer.java 907631 2010-02-08 12:22:48Z sebb $
87 */
88public class StrTokenizer implements ListIterator, Cloneable {
89
/** Template cloned by the CSV factory methods; fully configured in the static initializer. */
private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
/** Template cloned by the TSV factory methods; fully configured in the static initializer. */
private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
static {
    // Comma-separated template: double-quote quoting, nothing ignored,
    // trim matcher enabled, empty tokens kept and reported as "" (not null).
    final StrTokenizer csv = new StrTokenizer();
    csv.setDelimiterMatcher(StrMatcher.commaMatcher())
            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
            .setIgnoredMatcher(StrMatcher.noneMatcher())
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .setEmptyTokenAsNull(false);
    csv.setIgnoreEmptyTokens(false);
    CSV_TOKENIZER_PROTOTYPE = csv;

    // Tab-separated template: identical settings except for the delimiter.
    final StrTokenizer tsv = new StrTokenizer();
    tsv.setDelimiterMatcher(StrMatcher.tabMatcher())
            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
            .setIgnoredMatcher(StrMatcher.noneMatcher())
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .setEmptyTokenAsNull(false);
    tsv.setIgnoreEmptyTokens(false);
    TSV_TOKENIZER_PROTOTYPE = tsv;
}
109
110 /** The text to work on. */
111 private char chars[];
112 /** The parsed tokens */
113 private String tokens[];
114 /** The current iteration position */
115 private int tokenPos;
116
117 /** The delimiter matcher */
118 private StrMatcher delimMatcher = StrMatcher.splitMatcher();
119 /** The quote matcher */
120 private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
121 /** The ignored matcher */
122 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
123 /** The trimmer matcher */
124 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
125
126 /** Whether to return empty tokens as null */
127 private boolean emptyAsNull = false;
128 /** Whether to ignore empty tokens */
129 private boolean ignoreEmptyTokens = true;
130
131 //-----------------------------------------------------------------------
132
133 /**
134 * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
135 *
136 * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
137 */
138 private static StrTokenizer getCSVClone() {
139 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
140 }
141
142 /**
143 * Gets a new tokenizer instance which parses Comma Separated Value strings
144 * initializing it with the given input. The default for CSV processing
145 * will be trim whitespace from both ends (which can be overridden with
146 * the setTrimmer method).
147 * <p>
148 * You must call a "reset" method to set the string which you want to parse.
149 * @return a new tokenizer instance which parses Comma Separated Value strings
150 */
151 public static StrTokenizer getCSVInstance() {
152 return getCSVClone();
153 }
154
155 /**
156 * Gets a new tokenizer instance which parses Comma Separated Value strings
157 * initializing it with the given input. The default for CSV processing
158 * will be trim whitespace from both ends (which can be overridden with
159 * the setTrimmer method).
160 *
161 * @param input the text to parse
162 * @return a new tokenizer instance which parses Comma Separated Value strings
163 */
164 public static StrTokenizer getCSVInstance(String input) {
165 StrTokenizer tok = getCSVClone();
166 tok.reset(input);
167 return tok;
168 }
169
170 /**
171 * Gets a new tokenizer instance which parses Comma Separated Value strings
172 * initializing it with the given input. The default for CSV processing
173 * will be trim whitespace from both ends (which can be overridden with
174 * the setTrimmer method).
175 *
176 * @param input the text to parse
177 * @return a new tokenizer instance which parses Comma Separated Value strings
178 */
179 public static StrTokenizer getCSVInstance(char[] input) {
180 StrTokenizer tok = getCSVClone();
181 tok.reset(input);
182 return tok;
183 }
184
185 /**
186 * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
187 *
188 * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
189 */
190 private static StrTokenizer getTSVClone() {
191 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
192 }
193
194
195 /**
196 * Gets a new tokenizer instance which parses Tab Separated Value strings.
197 * The default for CSV processing will be trim whitespace from both ends
198 * (which can be overridden with the setTrimmer method).
199 * <p>
200 * You must call a "reset" method to set the string which you want to parse.
201 * @return a new tokenizer instance which parses Tab Separated Value strings.
202 */
203 public static StrTokenizer getTSVInstance() {
204 return getTSVClone();
205 }
206
207 /**
208 * Gets a new tokenizer instance which parses Tab Separated Value strings.
209 * The default for CSV processing will be trim whitespace from both ends
210 * (which can be overridden with the setTrimmer method).
211 * @param input the string to parse
212 * @return a new tokenizer instance which parses Tab Separated Value strings.
213 */
214 public static StrTokenizer getTSVInstance(String input) {
215 StrTokenizer tok = getTSVClone();
216 tok.reset(input);
217 return tok;
218 }
219
220 /**
221 * Gets a new tokenizer instance which parses Tab Separated Value strings.
222 * The default for CSV processing will be trim whitespace from both ends
223 * (which can be overridden with the setTrimmer method).
224 * @param input the string to parse
225 * @return a new tokenizer instance which parses Tab Separated Value strings.
226 */
227 public static StrTokenizer getTSVInstance(char[] input) {
228 StrTokenizer tok = getTSVClone();
229 tok.reset(input);
230 return tok;
231 }
232
233 //-----------------------------------------------------------------------
/**
 * Constructs a tokenizer that uses the default matchers (delimiter from
 * {@link StrMatcher#splitMatcher()}) and has no text to tokenize.
 * <p>
 * This constructor is normally used with {@link #reset(String)}.
 */
public StrTokenizer() {
    // no input yet; tokenizing in this state goes through the null-input path
    chars = null;
}
244
/**
 * Constructs a tokenizer over the given string using the default matchers
 * (delimiter from {@link StrMatcher#splitMatcher()}).
 *
 * @param input  the string which is to be parsed, null means no text
 */
public StrTokenizer(String input) {
    // the tokenizer works on a private char[] copy of the string
    this.chars = (input == null) ? null : input.toCharArray();
}
259
/**
 * Constructs a tokenizer over the given string that splits on a single
 * delimiter character.
 *
 * @param input  the string which is to be parsed
 * @param delim  the field delimiter character
 */
public StrTokenizer(String input, char delim) {
    this(input);
    this.setDelimiterChar(delim);
}
270
/**
 * Constructs a tokenizer over the given string that splits on a
 * delimiter string.
 *
 * @param input  the string which is to be parsed
 * @param delim  the field delimiter string
 */
public StrTokenizer(String input, String delim) {
    this(input);
    this.setDelimiterString(delim);
}
281
/**
 * Constructs a tokenizer over the given string that splits wherever the
 * supplied delimiter matcher matches.
 *
 * @param input  the string which is to be parsed
 * @param delim  the field delimiter matcher
 */
public StrTokenizer(String input, StrMatcher delim) {
    this(input);
    this.setDelimiterMatcher(delim);
}
292
/**
 * Constructs a tokenizer over the given string that splits on a delimiter
 * character and treats the given character as the quote.
 *
 * @param input  the string which is to be parsed
 * @param delim  the field delimiter character
 * @param quote  the field quoted string character
 */
public StrTokenizer(String input, char delim, char quote) {
    this(input, delim);
    this.setQuoteChar(quote);
}
305
/**
 * Constructs a tokenizer over the given string that splits using a
 * delimiter matcher and recognises quotes using a quote matcher.
 *
 * @param input  the string which is to be parsed
 * @param delim  the field delimiter matcher
 * @param quote  the field quoted string matcher
 */
public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
    this(input, delim);
    this.setQuoteMatcher(quote);
}
318
/**
 * Constructs a tokenizer over the given characters using the default
 * matchers (delimiter from {@link StrMatcher#splitMatcher()}).
 * <p>
 * The array reference is stored as-is, without copying, so the caller
 * must not alter the array after passing it in.
 *
 * @param input  the characters which are to be parsed, not cloned
 */
public StrTokenizer(char[] input) {
    chars = input;
}
332
/**
 * Constructs a tokenizer over the given characters that splits on a
 * single delimiter character.
 * <p>
 * The array reference is stored as-is, without copying, so the caller
 * must not alter the array after passing it in.
 *
 * @param input  the characters which are to be parsed, not cloned
 * @param delim  the field delimiter character
 */
public StrTokenizer(char[] input, char delim) {
    this(input);
    this.setDelimiterChar(delim);
}
346
/**
 * Constructs a tokenizer over the given characters that splits on a
 * delimiter string.
 * <p>
 * The array reference is stored as-is, without copying, so the caller
 * must not alter the array after passing it in.
 *
 * @param input  the characters which are to be parsed, not cloned
 * @param delim  the field delimiter string
 */
public StrTokenizer(char[] input, String delim) {
    this(input);
    this.setDelimiterString(delim);
}
360
/**
 * Constructs a tokenizer over the given characters that splits wherever
 * the supplied delimiter matcher matches.
 * <p>
 * The array reference is stored as-is, without copying, so the caller
 * must not alter the array after passing it in.
 *
 * @param input  the characters which are to be parsed, not cloned
 * @param delim  the field delimiter matcher
 */
public StrTokenizer(char[] input, StrMatcher delim) {
    this(input);
    this.setDelimiterMatcher(delim);
}
374
/**
 * Constructs a tokenizer over the given characters that splits on a
 * delimiter character and treats the given character as the quote.
 * <p>
 * The array reference is stored as-is, without copying, so the caller
 * must not alter the array after passing it in.
 *
 * @param input  the characters which are to be parsed, not cloned
 * @param delim  the field delimiter character
 * @param quote  the field quoted string character
 */
public StrTokenizer(char[] input, char delim, char quote) {
    this(input, delim);
    this.setQuoteChar(quote);
}
390
/**
 * Constructs a tokenizer over the given characters that splits using a
 * delimiter matcher and recognises quotes using a quote matcher.
 * <p>
 * The array reference is stored as-is, without copying, so the caller
 * must not alter the array after passing it in.
 *
 * @param input  the characters which are to be parsed, not cloned
 * @param delim  the field delimiter matcher
 * @param quote  the field quoted string matcher
 */
public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
    this(input, delim);
    this.setQuoteMatcher(quote);
}
406
407 // API
408 //-----------------------------------------------------------------------
409 /**
410 * Gets the number of tokens found in the String.
411 *
412 * @return the number of matched tokens
413 */
414 public int size() {
415 checkTokenized();
416 return tokens.length;
417 }
418
419 /**
420 * Gets the next token from the String.
421 * Equivalent to {@link #next()} except it returns null rather than
422 * throwing {@link NoSuchElementException} when no tokens remain.
423 *
424 * @return the next sequential token, or null when no more tokens are found
425 */
426 public String nextToken() {
427 if (hasNext()) {
428 return tokens[tokenPos++];
429 }
430 return null;
431 }
432
433 /**
434 * Gets the previous token from the String.
435 *
436 * @return the previous sequential token, or null when no more tokens are found
437 */
438 public String previousToken() {
439 if (hasPrevious()) {
440 return tokens[--tokenPos];
441 }
442 return null;
443 }
444
445 /**
446 * Gets a copy of the full token list as an independent modifiable array.
447 *
448 * @return the tokens as a String array
449 */
450 public String[] getTokenArray() {
451 checkTokenized();
452 return (String[]) tokens.clone();
453 }
454
455 /**
456 * Gets a copy of the full token list as an independent modifiable list.
457 *
458 * @return the tokens as a String array
459 */
460 public List getTokenList() {
461 checkTokenized();
462 List list = new ArrayList(tokens.length);
463 for (int i = 0; i < tokens.length; i++) {
464 list.add(tokens[i]);
465 }
466 return list;
467 }
468
469 /**
470 * Resets this tokenizer, forgetting all parsing and iteration already completed.
471 * <p>
472 * This method allows the same tokenizer to be reused for the same String.
473 *
474 * @return this, to enable chaining
475 */
476 public StrTokenizer reset() {
477 tokenPos = 0;
478 tokens = null;
479 return this;
480 }
481
482 /**
483 * Reset this tokenizer, giving it a new input string to parse.
484 * In this manner you can re-use a tokenizer with the same settings
485 * on multiple input lines.
486 *
487 * @param input the new string to tokenize, null sets no text to parse
488 * @return this, to enable chaining
489 */
490 public StrTokenizer reset(String input) {
491 reset();
492 if (input != null) {
493 this.chars = input.toCharArray();
494 } else {
495 this.chars = null;
496 }
497 return this;
498 }
499
500 /**
501 * Reset this tokenizer, giving it a new input string to parse.
502 * In this manner you can re-use a tokenizer with the same settings
503 * on multiple input lines.
504 * <p>
505 * The input character array is not cloned, and must not be altered after
506 * passing in to this method.
507 *
508 * @param input the new character array to tokenize, not cloned, null sets no text to parse
509 * @return this, to enable chaining
510 */
511 public StrTokenizer reset(char[] input) {
512 reset();
513 this.chars = input;
514 return this;
515 }
516
517 // ListIterator
518 //-----------------------------------------------------------------------
519 /**
520 * Checks whether there are any more tokens.
521 *
522 * @return true if there are more tokens
523 */
524 public boolean hasNext() {
525 checkTokenized();
526 return tokenPos < tokens.length;
527 }
528
529 /**
530 * Gets the next token.
531 *
532 * @return the next String token
533 * @throws NoSuchElementException if there are no more elements
534 */
535 public Object next() {
536 if (hasNext()) {
537 return tokens[tokenPos++];
538 }
539 throw new NoSuchElementException();
540 }
541
542 /**
543 * Gets the index of the next token to return.
544 *
545 * @return the next token index
546 */
547 public int nextIndex() {
548 return tokenPos;
549 }
550
551 /**
552 * Checks whether there are any previous tokens that can be iterated to.
553 *
554 * @return true if there are previous tokens
555 */
556 public boolean hasPrevious() {
557 checkTokenized();
558 return tokenPos > 0;
559 }
560
561 /**
562 * Gets the token previous to the last returned token.
563 *
564 * @return the previous token
565 */
566 public Object previous() {
567 if (hasPrevious()) {
568 return tokens[--tokenPos];
569 }
570 throw new NoSuchElementException();
571 }
572
573 /**
574 * Gets the index of the previous token.
575 *
576 * @return the previous token index
577 */
578 public int previousIndex() {
579 return tokenPos - 1;
580 }
581
582 /**
583 * Unsupported ListIterator operation.
584 *
585 * @throws UnsupportedOperationException always
586 */
587 public void remove() {
588 throw new UnsupportedOperationException("remove() is unsupported");
589 }
590
591 /**
592 * Unsupported ListIterator operation.
593 * @param obj this parameter ignored.
594 * @throws UnsupportedOperationException always
595 */
596 public void set(Object obj) {
597 throw new UnsupportedOperationException("set() is unsupported");
598 }
599
600 /**
601 * Unsupported ListIterator operation.
602 * @param obj this parameter ignored.
603 * @throws UnsupportedOperationException always
604 */
605 public void add(Object obj) {
606 throw new UnsupportedOperationException("add() is unsupported");
607 }
608
609 // Implementation
610 //-----------------------------------------------------------------------
611 /**
612 * Checks if tokenization has been done, and if not then do it.
613 */
614 private void checkTokenized() {
615 if (tokens == null) {
616 if (chars == null) {
617 // still call tokenize as subclass may do some work
618 List split = tokenize(null, 0, 0);
619 tokens = (String[]) split.toArray(new String[split.size()]);
620 } else {
621 List split = tokenize(chars, 0, chars.length);
622 tokens = (String[]) split.toArray(new String[split.size()]);
623 }
624 }
625 }
626
627 /**
628 * Internal method to performs the tokenization.
629 * <p>
630 * Most users of this class do not need to call this method. This method
631 * will be called automatically by other (public) methods when required.
632 * <p>
633 * This method exists to allow subclasses to add code before or after the
634 * tokenization. For example, a subclass could alter the character array,
635 * offset or count to be parsed, or call the tokenizer multiple times on
636 * multiple strings. It is also be possible to filter the results.
637 * <p>
638 * <code>StrTokenizer</code> will always pass a zero offset and a count
639 * equal to the length of the array to this method, however a subclass
640 * may pass other values, or even an entirely different array.
641 *
642 * @param chars the character array being tokenized, may be null
643 * @param offset the start position within the character array, must be valid
644 * @param count the number of characters to tokenize, must be valid
645 * @return the modifiable list of String tokens, unmodifiable if null array or zero count
646 */
647 protected List tokenize(char[] chars, int offset, int count) {
648 if (chars == null || count == 0) {
649 return Collections.EMPTY_LIST;
650 }
651 StrBuilder buf = new StrBuilder();
652 List tokens = new ArrayList();
653 int pos = offset;
654
655 // loop around the entire buffer
656 while (pos >= 0 && pos < count) {
657 // find next token
658 pos = readNextToken(chars, pos, count, buf, tokens);
659
660 // handle case where end of string is a delimiter
661 if (pos >= count) {
662 addToken(tokens, "");
663 }
664 }
665 return tokens;
666 }
667
668 /**
669 * Adds a token to a list, paying attention to the parameters we've set.
670 *
671 * @param list the list to add to
672 * @param tok the token to add
673 */
674 private void addToken(List list, String tok) {
675 if (tok == null || tok.length() == 0) {
676 if (isIgnoreEmptyTokens()) {
677 return;
678 }
679 if (isEmptyTokenAsNull()) {
680 tok = null;
681 }
682 }
683 list.add(tok);
684 }
685
686 /**
687 * Reads character by character through the String to get the next token.
688 *
689 * @param chars the character array being tokenized
690 * @param start the first character of field
691 * @param len the length of the character array being tokenized
692 * @param workArea a temporary work area
693 * @param tokens the list of parsed tokens
694 * @return the starting position of the next field (the character
695 * immediately after the delimiter), or -1 if end of string found
696 */
697 private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List tokens) {
698 // skip all leading whitespace, unless it is the
699 // field delimiter or the quote character
700 while (start < len) {
701 int removeLen = Math.max(
702 getIgnoredMatcher().isMatch(chars, start, start, len),
703 getTrimmerMatcher().isMatch(chars, start, start, len));
704 if (removeLen == 0 ||
705 getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
706 getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
707 break;
708 }
709 start += removeLen;
710 }
711
712 // handle reaching end
713 if (start >= len) {
714 addToken(tokens, "");
715 return -1;
716 }
717
718 // handle empty token
719 int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
720 if (delimLen > 0) {
721 addToken(tokens, "");
722 return start + delimLen;
723 }
724
725 // handle found token
726 int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
727 if (quoteLen > 0) {
728 return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
729 }
730 return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
731 }
732
733 /**
734 * Reads a possibly quoted string token.
735 *
736 * @param chars the character array being tokenized
737 * @param start the first character of field
738 * @param len the length of the character array being tokenized
739 * @param workArea a temporary work area
740 * @param tokens the list of parsed tokens
741 * @param quoteStart the start position of the matched quote, 0 if no quoting
742 * @param quoteLen the length of the matched quote, 0 if no quoting
743 * @return the starting position of the next field (the character
744 * immediately after the delimiter, or if end of string found,
745 * then the length of string
746 */
747 private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea,
748 List tokens, int quoteStart, int quoteLen)
749 {
750 // Loop until we've found the end of the quoted
751 // string or the end of the input
752 workArea.clear();
753 int pos = start;
754 boolean quoting = (quoteLen > 0);
755 int trimStart = 0;
756
757 while (pos < len) {
758 // quoting mode can occur several times throughout a string
759 // we must switch between quoting and non-quoting until we
760 // encounter a non-quoted delimiter, or end of string
761 if (quoting) {
762 // In quoting mode
763
764 // If we've found a quote character, see if it's
765 // followed by a second quote. If so, then we need
766 // to actually put the quote character into the token
767 // rather than end the token.
768 if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
769 if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
770 // matched pair of quotes, thus an escaped quote
771 workArea.append(chars, pos, quoteLen);
772 pos += (quoteLen * 2);
773 trimStart = workArea.size();
774 continue;
775 }
776
777 // end of quoting
778 quoting = false;
779 pos += quoteLen;
780 continue;
781 }
782
783 // copy regular character from inside quotes
784 workArea.append(chars[pos++]);
785 trimStart = workArea.size();
786
787 } else {
788 // Not in quoting mode
789
790 // check for delimiter, and thus end of token
791 int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
792 if (delimLen > 0) {
793 // return condition when end of token found
794 addToken(tokens, workArea.substring(0, trimStart));
795 return pos + delimLen;
796 }
797
798 // check for quote, and thus back into quoting mode
799 if (quoteLen > 0) {
800 if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
801 quoting = true;
802 pos += quoteLen;
803 continue;
804 }
805 }
806
807 // check for ignored (outside quotes), and ignore
808 int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
809 if (ignoredLen > 0) {
810 pos += ignoredLen;
811 continue;
812 }
813
814 // check for trimmed character
815 // don't yet know if its at the end, so copy to workArea
816 // use trimStart to keep track of trim at the end
817 int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
818 if (trimmedLen > 0) {
819 workArea.append(chars, pos, trimmedLen);
820 pos += trimmedLen;
821 continue;
822 }
823
824 // copy regular character from outside quotes
825 workArea.append(chars[pos++]);
826 trimStart = workArea.size();
827 }
828 }
829
830 // return condition when end of string found
831 addToken(tokens, workArea.substring(0, trimStart));
832 return -1;
833 }
834
835 /**
836 * Checks if the characters at the index specified match the quote
837 * already matched in readNextToken().
838 *
839 * @param chars the character array being tokenized
840 * @param pos the position to check for a quote
841 * @param len the length of the character array being tokenized
842 * @param quoteStart the start position of the matched quote, 0 if no quoting
843 * @param quoteLen the length of the matched quote, 0 if no quoting
844 * @return true if a quote is matched
845 */
846 private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
847 for (int i = 0; i < quoteLen; i++) {
848 if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
849 return false;
850 }
851 }
852 return true;
853 }
854
855 // Delimiter
856 //-----------------------------------------------------------------------
857 /**
858 * Gets the field delimiter matcher.
859 *
860 * @return the delimiter matcher in use
861 */
862 public StrMatcher getDelimiterMatcher() {
863 return this.delimMatcher;
864 }
865
866 /**
867 * Sets the field delimiter matcher.
868 * <p>
869 * The delimitier is used to separate one token from another.
870 *
871 * @param delim the delimiter matcher to use
872 * @return this, to enable chaining
873 */
874 public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
875 if (delim == null) {
876 this.delimMatcher = StrMatcher.noneMatcher();
877 } else {
878 this.delimMatcher = delim;
879 }
880 return this;
881 }
882
883 /**
884 * Sets the field delimiter character.
885 *
886 * @param delim the delimiter character to use
887 * @return this, to enable chaining
888 */
889 public StrTokenizer setDelimiterChar(char delim) {
890 return setDelimiterMatcher(StrMatcher.charMatcher(delim));
891 }
892
893 /**
894 * Sets the field delimiter string.
895 *
896 * @param delim the delimiter string to use
897 * @return this, to enable chaining
898 */
899 public StrTokenizer setDelimiterString(String delim) {
900 return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
901 }
902
903 // Quote
904 //-----------------------------------------------------------------------
905 /**
906 * Gets the quote matcher currently in use.
907 * <p>
908 * The quote character is used to wrap data between the tokens.
909 * This enables delimiters to be entered as data.
910 * The default value is '"' (double quote).
911 *
912 * @return the quote matcher in use
913 */
914 public StrMatcher getQuoteMatcher() {
915 return quoteMatcher;
916 }
917
918 /**
919 * Set the quote matcher to use.
920 * <p>
921 * The quote character is used to wrap data between the tokens.
922 * This enables delimiters to be entered as data.
923 *
924 * @param quote the quote matcher to use, null ignored
925 * @return this, to enable chaining
926 */
927 public StrTokenizer setQuoteMatcher(StrMatcher quote) {
928 if (quote != null) {
929 this.quoteMatcher = quote;
930 }
931 return this;
932 }
933
934 /**
935 * Sets the quote character to use.
936 * <p>
937 * The quote character is used to wrap data between the tokens.
938 * This enables delimiters to be entered as data.
939 *
940 * @param quote the quote character to use
941 * @return this, to enable chaining
942 */
943 public StrTokenizer setQuoteChar(char quote) {
944 return setQuoteMatcher(StrMatcher.charMatcher(quote));
945 }
946
947 // Ignored
948 //-----------------------------------------------------------------------
949 /**
950 * Gets the ignored character matcher.
951 * <p>
952 * These characters are ignored when parsing the String, unless they are
953 * within a quoted region.
954 * The default value is not to ignore anything.
955 *
956 * @return the ignored matcher in use
957 */
958 public StrMatcher getIgnoredMatcher() {
959 return ignoredMatcher;
960 }
961
962 /**
963 * Set the matcher for characters to ignore.
964 * <p>
965 * These characters are ignored when parsing the String, unless they are
966 * within a quoted region.
967 *
968 * @param ignored the ignored matcher to use, null ignored
969 * @return this, to enable chaining
970 */
971 public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
972 if (ignored != null) {
973 this.ignoredMatcher = ignored;
974 }
975 return this;
976 }
977
978 /**
979 * Set the character to ignore.
980 * <p>
981 * This character is ignored when parsing the String, unless it is
982 * within a quoted region.
983 *
984 * @param ignored the ignored character to use
985 * @return this, to enable chaining
986 */
987 public StrTokenizer setIgnoredChar(char ignored) {
988 return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
989 }
990
991 // Trimmer
992 //-----------------------------------------------------------------------
993 /**
994 * Gets the trimmer character matcher.
995 * <p>
996 * These characters are trimmed off on each side of the delimiter
997 * until the token or quote is found.
998 * The default value is not to trim anything.
999 *
1000 * @return the trimmer matcher in use
1001 */
1002 public StrMatcher getTrimmerMatcher() {
1003 return trimmerMatcher;
1004 }
1005
1006 /**
1007 * Sets the matcher for characters to trim.
1008 * <p>
1009 * These characters are trimmed off on each side of the delimiter
1010 * until the token or quote is found.
1011 *
1012 * @param trimmer the trimmer matcher to use, null ignored
1013 * @return this, to enable chaining
1014 */
1015 public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
1016 if (trimmer != null) {
1017 this.trimmerMatcher = trimmer;
1018 }
1019 return this;
1020 }
1021
1022 //-----------------------------------------------------------------------
1023 /**
1024 * Gets whether the tokenizer currently returns empty tokens as null.
1025 * The default for this property is false.
1026 *
1027 * @return true if empty tokens are returned as null
1028 */
1029 public boolean isEmptyTokenAsNull() {
1030 return this.emptyAsNull;
1031 }
1032
1033 /**
1034 * Sets whether the tokenizer should return empty tokens as null.
1035 * The default for this property is false.
1036 *
1037 * @param emptyAsNull whether empty tokens are returned as null
1038 * @return this, to enable chaining
1039 */
1040 public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
1041 this.emptyAsNull = emptyAsNull;
1042 return this;
1043 }
1044
1045 //-----------------------------------------------------------------------
1046 /**
1047 * Gets whether the tokenizer currently ignores empty tokens.
1048 * The default for this property is true.
1049 *
1050 * @return true if empty tokens are not returned
1051 */
1052 public boolean isIgnoreEmptyTokens() {
1053 return ignoreEmptyTokens;
1054 }
1055
1056 /**
1057 * Sets whether the tokenizer should ignore and not return empty tokens.
1058 * The default for this property is true.
1059 *
1060 * @param ignoreEmptyTokens whether empty tokens are not returned
1061 * @return this, to enable chaining
1062 */
1063 public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
1064 this.ignoreEmptyTokens = ignoreEmptyTokens;
1065 return this;
1066 }
1067
1068 //-----------------------------------------------------------------------
1069 /**
1070 * Gets the String content that the tokenizer is parsing.
1071 *
1072 * @return the string content being parsed
1073 */
1074 public String getContent() {
1075 if (chars == null) {
1076 return null;
1077 }
1078 return new String(chars);
1079 }
1080
1081 //-----------------------------------------------------------------------
1082 /**
1083 * Creates a new instance of this Tokenizer. The new instance is reset so
1084 * that it will be at the start of the token list.
1085 * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
1086 *
1087 * @return a new instance of this Tokenizer which has been reset.
1088 */
1089 public Object clone() {
1090 try {
1091 return cloneReset();
1092 } catch (CloneNotSupportedException ex) {
1093 return null;
1094 }
1095 }
1096
1097 /**
1098 * Creates a new instance of this Tokenizer. The new instance is reset so that
1099 * it will be at the start of the token list.
1100 *
1101 * @return a new instance of this Tokenizer which has been reset.
1102 * @throws CloneNotSupportedException if there is a problem cloning
1103 */
1104 Object cloneReset() throws CloneNotSupportedException {
1105 // this method exists to enable 100% test coverage
1106 StrTokenizer cloned = (StrTokenizer) super.clone();
1107 if (cloned.chars != null) {
1108 cloned.chars = (char[]) cloned.chars.clone();
1109 }
1110 cloned.reset();
1111 return cloned;
1112 }
1113
1114 //-----------------------------------------------------------------------
1115 /**
1116 * Gets the String content that the tokenizer is parsing.
1117 *
1118 * @return the string content being parsed
1119 */
1120 public String toString() {
1121 if (tokens == null) {
1122 return "StrTokenizer[not tokenized yet]";
1123 }
1124 return "StrTokenizer" + getTokenList();
1125 }
1126
1127}
Note: See TracBrowser for help on using the repository browser.