Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

StrTokenizer.java

Last change on this file was 127, checked in by Wouter Pasman, 6 years ago
#41 ROLL BACK of rev.126 . So this version is equal to rev. 125
File size: 38.5 KB

Line
1	/*
2	* Licensed to the Apache Software Foundation (ASF) under one or more
3	* contributor license agreements. See the NOTICE file distributed with
4	* this work for additional information regarding copyright ownership.
5	* The ASF licenses this file to You under the Apache License, Version 2.0
6	* (the "License"); you may not use this file except in compliance with
7	* the License. You may obtain a copy of the License at
8	*
9	* http://www.apache.org/licenses/LICENSE-2.0
10	*
11	* Unless required by applicable law or agreed to in writing, software
12	* distributed under the License is distributed on an "AS IS" BASIS,
13	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14	* See the License for the specific language governing permissions and
15	* limitations under the License.
16	*/
17	package agents.org.apache.commons.lang.text;
18
19	import java.util.ArrayList;
20	import java.util.Collections;
21	import java.util.List;
22	import java.util.ListIterator;
23	import java.util.NoSuchElementException;
24
25	/**
26	* Tokenizes a string based on delimiters (separators)
27	* and supporting quoting and ignored character concepts.
28	* <p>
29	* This class can split a String into many smaller strings. It aims
30	* to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
31	* however it offers much more control and flexibility including implementing
32	* the <code>ListIterator</code> interface. By default, it is set up
33	* like <code>StringTokenizer</code>.
34	* <p>
35	* The input String is split into a number of <i>tokens</i>.
36	* Each token is separated from the next String by a <i>delimiter</i>.
37	* One or more delimiter characters must be specified.
38	* <p>
39	* Each token may be surrounded by quotes.
40	* The <i>quote</i> matcher specifies the quote character(s).
41	* A quote may be escaped within a quoted section by duplicating itself.
42	* <p>
43	* Between each token and the delimiter are potentially characters that need trimming.
44	* The <i>trimmer</i> matcher specifies these characters.
45	* One usage might be to trim whitespace characters.
46	* <p>
47	* At any point outside the quotes there might potentially be invalid characters.
48	* The <i>ignored</i> matcher specifies these characters to be removed.
49	* One usage might be to remove new line characters.
50	* <p>
51	* Empty tokens may be removed or returned as null.
52	* <pre>
53	* "a,b,c" - Three tokens "a","b","c" (comma delimiter)
54	* " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
55	* "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
56	* </pre>
57	* <p>
58	*
59	* This tokenizer has the following properties and options:
60	*
61	* <table>
62	* <tr>
63	* <th>Property</th><th>Type</th><th>Default</th>
64	* </tr>
65	* <tr>
66	* <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
67	* </tr>
68	* <tr>
69	* <td>quote</td><td>NoneMatcher</td><td>{}</td>
70	* </tr>
71	* <tr>
72	* <td>ignore</td><td>NoneMatcher</td><td>{}</td>
73	* </tr>
74	* <tr>
75	* <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
76	* </tr>
77	* <tr>
78	* <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
79	* </tr>
80	* </table>
81	*
82	* @author Apache Software Foundation
83	* @author Matthew Inger
84	* @author Gary D. Gregory
85	* @since 2.2
86	* @version $Id: StrTokenizer.java 907631 2010-02-08 12:22:48Z sebb $
87	*/
88	public class StrTokenizer implements ListIterator, Cloneable {
89
/** Prototype cloned by the <code>getCSVInstance</code> factory methods. */
private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
/** Prototype cloned by the <code>getTSVInstance</code> factory methods. */
private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
static {
    // CSV: comma delimiter, double-quote quoting, trim matcher for trimming.
    StrTokenizer csv = new StrTokenizer();
    csv.setDelimiterMatcher(StrMatcher.commaMatcher());
    csv.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
    csv.setIgnoredMatcher(StrMatcher.noneMatcher());
    csv.setTrimmerMatcher(StrMatcher.trimMatcher());
    csv.setEmptyTokenAsNull(false);
    csv.setIgnoreEmptyTokens(false);
    CSV_TOKENIZER_PROTOTYPE = csv;

    // TSV: identical configuration except for the tab delimiter.
    StrTokenizer tsv = new StrTokenizer();
    tsv.setDelimiterMatcher(StrMatcher.tabMatcher());
    tsv.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
    tsv.setIgnoredMatcher(StrMatcher.noneMatcher());
    tsv.setTrimmerMatcher(StrMatcher.trimMatcher());
    tsv.setEmptyTokenAsNull(false);
    tsv.setIgnoreEmptyTokens(false);
    TSV_TOKENIZER_PROTOTYPE = tsv;
}
109
110	/** The text to work on. */
111	private char chars[];
112	/** The parsed tokens */
113	private String tokens[];
114	/** The current iteration position */
115	private int tokenPos;
116
117	/** The delimiter matcher */
118	private StrMatcher delimMatcher = StrMatcher.splitMatcher();
119	/** The quote matcher */
120	private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
121	/** The ignored matcher */
122	private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
123	/** The trimmer matcher */
124	private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
125
126	/** Whether to return empty tokens as null */
127	private boolean emptyAsNull = false;
128	/** Whether to ignore empty tokens */
129	private boolean ignoreEmptyTokens = true;
130
131	//-----------------------------------------------------------------------
132
133	/**
134	* Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
135	*
136	* @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
137	*/
138	private static StrTokenizer getCSVClone() {
139	return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
140	}
141
142	/**
143	* Gets a new tokenizer instance which parses Comma Separated Value strings
144	* initializing it with the given input. The default for CSV processing
145	* will be trim whitespace from both ends (which can be overridden with
146	* the setTrimmer method).
147	* <p>
148	* You must call a "reset" method to set the string which you want to parse.
149	* @return a new tokenizer instance which parses Comma Separated Value strings
150	*/
151	public static StrTokenizer getCSVInstance() {
152	return getCSVClone();
153	}
154
155	/**
156	* Gets a new tokenizer instance which parses Comma Separated Value strings
157	* initializing it with the given input. The default for CSV processing
158	* will be trim whitespace from both ends (which can be overridden with
159	* the setTrimmer method).
160	*
161	* @param input the text to parse
162	* @return a new tokenizer instance which parses Comma Separated Value strings
163	*/
164	public static StrTokenizer getCSVInstance(String input) {
165	StrTokenizer tok = getCSVClone();
166	tok.reset(input);
167	return tok;
168	}
169
170	/**
171	* Gets a new tokenizer instance which parses Comma Separated Value strings
172	* initializing it with the given input. The default for CSV processing
173	* will be trim whitespace from both ends (which can be overridden with
174	* the setTrimmer method).
175	*
176	* @param input the text to parse
177	* @return a new tokenizer instance which parses Comma Separated Value strings
178	*/
179	public static StrTokenizer getCSVInstance(char[] input) {
180	StrTokenizer tok = getCSVClone();
181	tok.reset(input);
182	return tok;
183	}
184
185	/**
186	* Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
187	*
188	* @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
189	*/
190	private static StrTokenizer getTSVClone() {
191	return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
192	}
193
194
195	/**
196	* Gets a new tokenizer instance which parses Tab Separated Value strings.
197	* The default for CSV processing will be trim whitespace from both ends
198	* (which can be overridden with the setTrimmer method).
199	* <p>
200	* You must call a "reset" method to set the string which you want to parse.
201	* @return a new tokenizer instance which parses Tab Separated Value strings.
202	*/
203	public static StrTokenizer getTSVInstance() {
204	return getTSVClone();
205	}
206
207	/**
208	* Gets a new tokenizer instance which parses Tab Separated Value strings.
209	* The default for CSV processing will be trim whitespace from both ends
210	* (which can be overridden with the setTrimmer method).
211	* @param input the string to parse
212	* @return a new tokenizer instance which parses Tab Separated Value strings.
213	*/
214	public static StrTokenizer getTSVInstance(String input) {
215	StrTokenizer tok = getTSVClone();
216	tok.reset(input);
217	return tok;
218	}
219
220	/**
221	* Gets a new tokenizer instance which parses Tab Separated Value strings.
222	* The default for CSV processing will be trim whitespace from both ends
223	* (which can be overridden with the setTrimmer method).
224	* @param input the string to parse
225	* @return a new tokenizer instance which parses Tab Separated Value strings.
226	*/
227	public static StrTokenizer getTSVInstance(char[] input) {
228	StrTokenizer tok = getTSVClone();
229	tok.reset(input);
230	return tok;
231	}
232
233	//-----------------------------------------------------------------------
/**
 * Constructs a tokenizer that uses the default matchers
 * (delimiter: <code>StrMatcher.splitMatcher()</code>) and has no text to tokenize.
 * <p>
 * This constructor is normally used with {@link #reset(String)}.
 */
public StrTokenizer() {
    // no text yet; a reset(...) call supplies it later
    chars = null;
}
244
/**
 * Constructs a tokenizer over the given string using the default matchers
 * (delimiter: <code>StrMatcher.splitMatcher()</code>).
 *
 * @param input the string which is to be parsed, null means no text
 */
public StrTokenizer(String input) {
    this.chars = (input == null) ? null : input.toCharArray();
}
259
/**
 * Constructs a tokenizer over the given string that splits on a single
 * delimiter character.
 *
 * @param input the string which is to be parsed
 * @param delim the field delimiter character
 */
public StrTokenizer(String input, char delim) {
    this(input);
    this.setDelimiterChar(delim);
}
270
/**
 * Constructs a tokenizer over the given string that splits on a delimiter
 * string.
 *
 * @param input the string which is to be parsed
 * @param delim the field delimiter string
 */
public StrTokenizer(String input, String delim) {
    this(input);
    this.setDelimiterString(delim);
}
281
/**
 * Constructs a tokenizer over the given string that splits using a
 * delimiter matcher.
 *
 * @param input the string which is to be parsed
 * @param delim the field delimiter matcher
 */
public StrTokenizer(String input, StrMatcher delim) {
    this(input);
    this.setDelimiterMatcher(delim);
}
292
/**
 * Constructs a tokenizer over the given string that splits on a delimiter
 * character and recognises quoting with a quote character.
 *
 * @param input the string which is to be parsed
 * @param delim the field delimiter character
 * @param quote the field quote character
 */
public StrTokenizer(String input, char delim, char quote) {
    this(input, delim);
    this.setQuoteChar(quote);
}
305
/**
 * Constructs a tokenizer over the given string that splits using a
 * delimiter matcher and recognises quoting with a quote matcher.
 *
 * @param input the string which is to be parsed
 * @param delim the field delimiter matcher
 * @param quote the field quote matcher
 */
public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
    this(input, delim);
    this.setQuoteMatcher(quote);
}
318
/**
 * Constructs a tokenizer over the given character array using the default
 * matchers (delimiter: <code>StrMatcher.splitMatcher()</code>).
 * <p>
 * The array is stored as-is, without copying, so it must not be modified
 * after being handed to this constructor.
 *
 * @param input the characters which are to be parsed, not cloned
 */
public StrTokenizer(char[] input) {
    // keep the caller's array directly; no defensive copy is made
    chars = input;
}
332
/**
 * Constructs a tokenizer over the given character array that splits on a
 * single delimiter character.
 * <p>
 * The array is stored as-is, without copying, so it must not be modified
 * after being handed to this constructor.
 *
 * @param input the characters which are to be parsed, not cloned
 * @param delim the field delimiter character
 */
public StrTokenizer(char[] input, char delim) {
    this(input);
    this.setDelimiterChar(delim);
}
346
/**
 * Constructs a tokenizer over the given character array that splits on a
 * delimiter string.
 * <p>
 * The array is stored as-is, without copying, so it must not be modified
 * after being handed to this constructor.
 *
 * @param input the characters which are to be parsed, not cloned
 * @param delim the field delimiter string
 */
public StrTokenizer(char[] input, String delim) {
    this(input);
    this.setDelimiterString(delim);
}
360
/**
 * Constructs a tokenizer over the given character array that splits using
 * a delimiter matcher.
 * <p>
 * The array is stored as-is, without copying, so it must not be modified
 * after being handed to this constructor.
 *
 * @param input the characters which are to be parsed, not cloned
 * @param delim the field delimiter matcher
 */
public StrTokenizer(char[] input, StrMatcher delim) {
    this(input);
    this.setDelimiterMatcher(delim);
}
374
/**
 * Constructs a tokenizer over the given character array that splits on a
 * delimiter character and recognises quoting with a quote character.
 * <p>
 * The array is stored as-is, without copying, so it must not be modified
 * after being handed to this constructor.
 *
 * @param input the characters which are to be parsed, not cloned
 * @param delim the field delimiter character
 * @param quote the field quote character
 */
public StrTokenizer(char[] input, char delim, char quote) {
    this(input, delim);
    this.setQuoteChar(quote);
}
390
/**
 * Constructs a tokenizer over the given character array that splits using
 * a delimiter matcher and recognises quoting with a quote matcher.
 * <p>
 * The array is stored as-is, without copying, so it must not be modified
 * after being handed to this constructor.
 *
 * @param input the characters which are to be parsed, not cloned
 * @param delim the field delimiter matcher
 * @param quote the field quote matcher
 */
public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
    this(input, delim);
    this.setQuoteMatcher(quote);
}
406
407	// API
408	//-----------------------------------------------------------------------
409	/**
410	* Gets the number of tokens found in the String.
411	*
412	* @return the number of matched tokens
413	*/
414	public int size() {
415	checkTokenized();
416	return tokens.length;
417	}
418
419	/**
420	* Gets the next token from the String.
421	* Equivalent to {@link #next()} except it returns null rather than
422	* throwing {@link NoSuchElementException} when no tokens remain.
423	*
424	* @return the next sequential token, or null when no more tokens are found
425	*/
426	public String nextToken() {
427	if (hasNext()) {
428	return tokens[tokenPos++];
429	}
430	return null;
431	}
432
433	/**
434	* Gets the previous token from the String.
435	*
436	* @return the previous sequential token, or null when no more tokens are found
437	*/
438	public String previousToken() {
439	if (hasPrevious()) {
440	return tokens[--tokenPos];
441	}
442	return null;
443	}
444
445	/**
446	* Gets a copy of the full token list as an independent modifiable array.
447	*
448	* @return the tokens as a String array
449	*/
450	public String[] getTokenArray() {
451	checkTokenized();
452	return (String[]) tokens.clone();
453	}
454
455	/**
456	* Gets a copy of the full token list as an independent modifiable list.
457	*
458	* @return the tokens as a String array
459	*/
460	public List getTokenList() {
461	checkTokenized();
462	List list = new ArrayList(tokens.length);
463	for (int i = 0; i < tokens.length; i++) {
464	list.add(tokens[i]);
465	}
466	return list;
467	}
468
469	/**
470	* Resets this tokenizer, forgetting all parsing and iteration already completed.
471	* <p>
472	* This method allows the same tokenizer to be reused for the same String.
473	*
474	* @return this, to enable chaining
475	*/
476	public StrTokenizer reset() {
477	tokenPos = 0;
478	tokens = null;
479	return this;
480	}
481
482	/**
483	* Reset this tokenizer, giving it a new input string to parse.
484	* In this manner you can re-use a tokenizer with the same settings
485	* on multiple input lines.
486	*
487	* @param input the new string to tokenize, null sets no text to parse
488	* @return this, to enable chaining
489	*/
490	public StrTokenizer reset(String input) {
491	reset();
492	if (input != null) {
493	this.chars = input.toCharArray();
494	} else {
495	this.chars = null;
496	}
497	return this;
498	}
499
500	/**
501	* Reset this tokenizer, giving it a new input string to parse.
502	* In this manner you can re-use a tokenizer with the same settings
503	* on multiple input lines.
504	* <p>
505	* The input character array is not cloned, and must not be altered after
506	* passing in to this method.
507	*
508	* @param input the new character array to tokenize, not cloned, null sets no text to parse
509	* @return this, to enable chaining
510	*/
511	public StrTokenizer reset(char[] input) {
512	reset();
513	this.chars = input;
514	return this;
515	}
516
517	// ListIterator
518	//-----------------------------------------------------------------------
519	/**
520	* Checks whether there are any more tokens.
521	*
522	* @return true if there are more tokens
523	*/
524	public boolean hasNext() {
525	checkTokenized();
526	return tokenPos < tokens.length;
527	}
528
529	/**
530	* Gets the next token.
531	*
532	* @return the next String token
533	* @throws NoSuchElementException if there are no more elements
534	*/
535	public Object next() {
536	if (hasNext()) {
537	return tokens[tokenPos++];
538	}
539	throw new NoSuchElementException();
540	}
541
542	/**
543	* Gets the index of the next token to return.
544	*
545	* @return the next token index
546	*/
547	public int nextIndex() {
548	return tokenPos;
549	}
550
551	/**
552	* Checks whether there are any previous tokens that can be iterated to.
553	*
554	* @return true if there are previous tokens
555	*/
556	public boolean hasPrevious() {
557	checkTokenized();
558	return tokenPos > 0;
559	}
560
561	/**
562	* Gets the token previous to the last returned token.
563	*
564	* @return the previous token
565	*/
566	public Object previous() {
567	if (hasPrevious()) {
568	return tokens[--tokenPos];
569	}
570	throw new NoSuchElementException();
571	}
572
573	/**
574	* Gets the index of the previous token.
575	*
576	* @return the previous token index
577	*/
578	public int previousIndex() {
579	return tokenPos - 1;
580	}
581
582	/**
583	* Unsupported ListIterator operation.
584	*
585	* @throws UnsupportedOperationException always
586	*/
587	public void remove() {
588	throw new UnsupportedOperationException("remove() is unsupported");
589	}
590
591	/**
592	* Unsupported ListIterator operation.
593	* @param obj this parameter ignored.
594	* @throws UnsupportedOperationException always
595	*/
596	public void set(Object obj) {
597	throw new UnsupportedOperationException("set() is unsupported");
598	}
599
600	/**
601	* Unsupported ListIterator operation.
602	* @param obj this parameter ignored.
603	* @throws UnsupportedOperationException always
604	*/
605	public void add(Object obj) {
606	throw new UnsupportedOperationException("add() is unsupported");
607	}
608
609	// Implementation
610	//-----------------------------------------------------------------------
611	/**
612	* Checks if tokenization has been done, and if not then do it.
613	*/
614	private void checkTokenized() {
615	if (tokens == null) {
616	if (chars == null) {
617	// still call tokenize as subclass may do some work
618	List split = tokenize(null, 0, 0);
619	tokens = (String[]) split.toArray(new String[split.size()]);
620	} else {
621	List split = tokenize(chars, 0, chars.length);
622	tokens = (String[]) split.toArray(new String[split.size()]);
623	}
624	}
625	}
626
627	/**
628	* Internal method to performs the tokenization.
629	* <p>
630	* Most users of this class do not need to call this method. This method
631	* will be called automatically by other (public) methods when required.
632	* <p>
633	* This method exists to allow subclasses to add code before or after the
634	* tokenization. For example, a subclass could alter the character array,
635	* offset or count to be parsed, or call the tokenizer multiple times on
636	* multiple strings. It is also be possible to filter the results.
637	* <p>
638	* <code>StrTokenizer</code> will always pass a zero offset and a count
639	* equal to the length of the array to this method, however a subclass
640	* may pass other values, or even an entirely different array.
641	*
642	* @param chars the character array being tokenized, may be null
643	* @param offset the start position within the character array, must be valid
644	* @param count the number of characters to tokenize, must be valid
645	* @return the modifiable list of String tokens, unmodifiable if null array or zero count
646	*/
647	protected List tokenize(char[] chars, int offset, int count) {
648	if (chars == null \|\| count == 0) {
649	return Collections.EMPTY_LIST;
650	}
651	StrBuilder buf = new StrBuilder();
652	List tokens = new ArrayList();
653	int pos = offset;
654
655	// loop around the entire buffer
656	while (pos >= 0 && pos < count) {
657	// find next token
658	pos = readNextToken(chars, pos, count, buf, tokens);
659
660	// handle case where end of string is a delimiter
661	if (pos >= count) {
662	addToken(tokens, "");
663	}
664	}
665	return tokens;
666	}
667
668	/**
669	* Adds a token to a list, paying attention to the parameters we've set.
670	*
671	* @param list the list to add to
672	* @param tok the token to add
673	*/
674	private void addToken(List list, String tok) {
675	if (tok == null \|\| tok.length() == 0) {
676	if (isIgnoreEmptyTokens()) {
677	return;
678	}
679	if (isEmptyTokenAsNull()) {
680	tok = null;
681	}
682	}
683	list.add(tok);
684	}
685
686	/**
687	* Reads character by character through the String to get the next token.
688	*
689	* @param chars the character array being tokenized
690	* @param start the first character of field
691	* @param len the length of the character array being tokenized
692	* @param workArea a temporary work area
693	* @param tokens the list of parsed tokens
694	* @return the starting position of the next field (the character
695	* immediately after the delimiter), or -1 if end of string found
696	*/
697	private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List tokens) {
698	// skip all leading whitespace, unless it is the
699	// field delimiter or the quote character
700	while (start < len) {
701	int removeLen = Math.max(
702	getIgnoredMatcher().isMatch(chars, start, start, len),
703	getTrimmerMatcher().isMatch(chars, start, start, len));
704	if (removeLen == 0 \|\|
705	getDelimiterMatcher().isMatch(chars, start, start, len) > 0 \|\|
706	getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
707	break;
708	}
709	start += removeLen;
710	}
711
712	// handle reaching end
713	if (start >= len) {
714	addToken(tokens, "");
715	return -1;
716	}
717
718	// handle empty token
719	int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
720	if (delimLen > 0) {
721	addToken(tokens, "");
722	return start + delimLen;
723	}
724
725	// handle found token
726	int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
727	if (quoteLen > 0) {
728	return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
729	}
730	return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
731	}
732
733	/**
734	* Reads a possibly quoted string token.
735	*
736	* @param chars the character array being tokenized
737	* @param start the first character of field
738	* @param len the length of the character array being tokenized
739	* @param workArea a temporary work area
740	* @param tokens the list of parsed tokens
741	* @param quoteStart the start position of the matched quote, 0 if no quoting
742	* @param quoteLen the length of the matched quote, 0 if no quoting
743	* @return the starting position of the next field (the character
744	* immediately after the delimiter, or if end of string found,
745	* then the length of string
746	*/
747	private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea,
748	List tokens, int quoteStart, int quoteLen)
749	{
750	// Loop until we've found the end of the quoted
751	// string or the end of the input
752	workArea.clear();
753	int pos = start;
754	boolean quoting = (quoteLen > 0);
755	int trimStart = 0;
756
757	while (pos < len) {
758	// quoting mode can occur several times throughout a string
759	// we must switch between quoting and non-quoting until we
760	// encounter a non-quoted delimiter, or end of string
761	if (quoting) {
762	// In quoting mode
763
764	// If we've found a quote character, see if it's
765	// followed by a second quote. If so, then we need
766	// to actually put the quote character into the token
767	// rather than end the token.
768	if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
769	if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
770	// matched pair of quotes, thus an escaped quote
771	workArea.append(chars, pos, quoteLen);
772	pos += (quoteLen * 2);
773	trimStart = workArea.size();
774	continue;
775	}
776
777	// end of quoting
778	quoting = false;
779	pos += quoteLen;
780	continue;
781	}
782
783	// copy regular character from inside quotes
784	workArea.append(chars[pos++]);
785	trimStart = workArea.size();
786
787	} else {
788	// Not in quoting mode
789
790	// check for delimiter, and thus end of token
791	int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
792	if (delimLen > 0) {
793	// return condition when end of token found
794	addToken(tokens, workArea.substring(0, trimStart));
795	return pos + delimLen;
796	}
797
798	// check for quote, and thus back into quoting mode
799	if (quoteLen > 0) {
800	if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
801	quoting = true;
802	pos += quoteLen;
803	continue;
804	}
805	}
806
807	// check for ignored (outside quotes), and ignore
808	int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
809	if (ignoredLen > 0) {
810	pos += ignoredLen;
811	continue;
812	}
813
814	// check for trimmed character
815	// don't yet know if its at the end, so copy to workArea
816	// use trimStart to keep track of trim at the end
817	int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
818	if (trimmedLen > 0) {
819	workArea.append(chars, pos, trimmedLen);
820	pos += trimmedLen;
821	continue;
822	}
823
824	// copy regular character from outside quotes
825	workArea.append(chars[pos++]);
826	trimStart = workArea.size();
827	}
828	}
829
830	// return condition when end of string found
831	addToken(tokens, workArea.substring(0, trimStart));
832	return -1;
833	}
834
835	/**
836	* Checks if the characters at the index specified match the quote
837	* already matched in readNextToken().
838	*
839	* @param chars the character array being tokenized
840	* @param pos the position to check for a quote
841	* @param len the length of the character array being tokenized
842	* @param quoteStart the start position of the matched quote, 0 if no quoting
843	* @param quoteLen the length of the matched quote, 0 if no quoting
844	* @return true if a quote is matched
845	*/
846	private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
847	for (int i = 0; i < quoteLen; i++) {
848	if ((pos + i) >= len \|\| chars[pos + i] != chars[quoteStart + i]) {
849	return false;
850	}
851	}
852	return true;
853	}
854
855	// Delimiter
856	//-----------------------------------------------------------------------
857	/**
858	* Gets the field delimiter matcher.
859	*
860	* @return the delimiter matcher in use
861	*/
862	public StrMatcher getDelimiterMatcher() {
863	return this.delimMatcher;
864	}
865
866	/**
867	* Sets the field delimiter matcher.
868	* <p>
869	* The delimitier is used to separate one token from another.
870	*
871	* @param delim the delimiter matcher to use
872	* @return this, to enable chaining
873	*/
874	public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
875	if (delim == null) {
876	this.delimMatcher = StrMatcher.noneMatcher();
877	} else {
878	this.delimMatcher = delim;
879	}
880	return this;
881	}
882
883	/**
884	* Sets the field delimiter character.
885	*
886	* @param delim the delimiter character to use
887	* @return this, to enable chaining
888	*/
889	public StrTokenizer setDelimiterChar(char delim) {
890	return setDelimiterMatcher(StrMatcher.charMatcher(delim));
891	}
892
893	/**
894	* Sets the field delimiter string.
895	*
896	* @param delim the delimiter string to use
897	* @return this, to enable chaining
898	*/
899	public StrTokenizer setDelimiterString(String delim) {
900	return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
901	}
902
903	// Quote
904	//-----------------------------------------------------------------------
905	/**
906	* Gets the quote matcher currently in use.
907	* <p>
908	* The quote character is used to wrap data between the tokens.
909	* This enables delimiters to be entered as data.
910	* The default value is '"' (double quote).
911	*
912	* @return the quote matcher in use
913	*/
914	public StrMatcher getQuoteMatcher() {
915	return quoteMatcher;
916	}
917
918	/**
919	* Set the quote matcher to use.
920	* <p>
921	* The quote character is used to wrap data between the tokens.
922	* This enables delimiters to be entered as data.
923	*
924	* @param quote the quote matcher to use, null ignored
925	* @return this, to enable chaining
926	*/
927	public StrTokenizer setQuoteMatcher(StrMatcher quote) {
928	if (quote != null) {
929	this.quoteMatcher = quote;
930	}
931	return this;
932	}
933
934	/**
935	* Sets the quote character to use.
936	* <p>
937	* The quote character is used to wrap data between the tokens.
938	* This enables delimiters to be entered as data.
939	*
940	* @param quote the quote character to use
941	* @return this, to enable chaining
942	*/
943	public StrTokenizer setQuoteChar(char quote) {
944	return setQuoteMatcher(StrMatcher.charMatcher(quote));
945	}
946
947	// Ignored
948	//-----------------------------------------------------------------------
949	/**
950	* Gets the ignored character matcher.
951	* <p>
952	* These characters are ignored when parsing the String, unless they are
953	* within a quoted region.
954	* The default value is not to ignore anything.
955	*
956	* @return the ignored matcher in use
957	*/
958	public StrMatcher getIgnoredMatcher() {
959	return ignoredMatcher;
960	}
961
962	/**
963	* Set the matcher for characters to ignore.
964	* <p>
965	* These characters are ignored when parsing the String, unless they are
966	* within a quoted region.
967	*
968	* @param ignored the ignored matcher to use, null ignored
969	* @return this, to enable chaining
970	*/
971	public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
972	if (ignored != null) {
973	this.ignoredMatcher = ignored;
974	}
975	return this;
976	}
977
978	/**
979	* Set the character to ignore.
980	* <p>
981	* This character is ignored when parsing the String, unless it is
982	* within a quoted region.
983	*
984	* @param ignored the ignored character to use
985	* @return this, to enable chaining
986	*/
987	public StrTokenizer setIgnoredChar(char ignored) {
988	return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
989	}
990
991	// Trimmer
992	//-----------------------------------------------------------------------
993	/**
994	* Gets the trimmer character matcher.
995	* <p>
996	* These characters are trimmed off on each side of the delimiter
997	* until the token or quote is found.
998	* The default value is not to trim anything.
999	*
1000	* @return the trimmer matcher in use
1001	*/
1002	public StrMatcher getTrimmerMatcher() {
1003	return trimmerMatcher;
1004	}
1005
1006	/**
1007	* Sets the matcher for characters to trim.
1008	* <p>
1009	* These characters are trimmed off on each side of the delimiter
1010	* until the token or quote is found.
1011	*
1012	* @param trimmer the trimmer matcher to use, null ignored
1013	* @return this, to enable chaining
1014	*/
1015	public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
1016	if (trimmer != null) {
1017	this.trimmerMatcher = trimmer;
1018	}
1019	return this;
1020	}
1021
1022	//-----------------------------------------------------------------------
1023	/**
1024	* Gets whether the tokenizer currently returns empty tokens as null.
1025	* The default for this property is false.
1026	*
1027	* @return true if empty tokens are returned as null
1028	*/
1029	public boolean isEmptyTokenAsNull() {
1030	return this.emptyAsNull;
1031	}
1032
1033	/**
1034	* Sets whether the tokenizer should return empty tokens as null.
1035	* The default for this property is false.
1036	*
1037	* @param emptyAsNull whether empty tokens are returned as null
1038	* @return this, to enable chaining
1039	*/
1040	public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
1041	this.emptyAsNull = emptyAsNull;
1042	return this;
1043	}
1044
1045	//-----------------------------------------------------------------------
1046	/**
1047	* Gets whether the tokenizer currently ignores empty tokens.
1048	* The default for this property is true.
1049	*
1050	* @return true if empty tokens are not returned
1051	*/
1052	public boolean isIgnoreEmptyTokens() {
1053	return ignoreEmptyTokens;
1054	}
1055
1056	/**
1057	* Sets whether the tokenizer should ignore and not return empty tokens.
1058	* The default for this property is true.
1059	*
1060	* @param ignoreEmptyTokens whether empty tokens are not returned
1061	* @return this, to enable chaining
1062	*/
1063	public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
1064	this.ignoreEmptyTokens = ignoreEmptyTokens;
1065	return this;
1066	}
1067
1068	//-----------------------------------------------------------------------
1069	/**
1070	* Gets the String content that the tokenizer is parsing.
1071	*
1072	* @return the string content being parsed
1073	*/
1074	public String getContent() {
1075	if (chars == null) {
1076	return null;
1077	}
1078	return new String(chars);
1079	}
1080
1081	//-----------------------------------------------------------------------
1082	/**
1083	* Creates a new instance of this Tokenizer. The new instance is reset so
1084	* that it will be at the start of the token list.
1085	* If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
1086	*
1087	* @return a new instance of this Tokenizer which has been reset.
1088	*/
1089	public Object clone() {
1090	try {
1091	return cloneReset();
1092	} catch (CloneNotSupportedException ex) {
1093	return null;
1094	}
1095	}
1096
1097	/**
1098	* Creates a new instance of this Tokenizer. The new instance is reset so that
1099	* it will be at the start of the token list.
1100	*
1101	* @return a new instance of this Tokenizer which has been reset.
1102	* @throws CloneNotSupportedException if there is a problem cloning
1103	*/
1104	Object cloneReset() throws CloneNotSupportedException {
1105	// this method exists to enable 100% test coverage
1106	StrTokenizer cloned = (StrTokenizer) super.clone();
1107	if (cloned.chars != null) {
1108	cloned.chars = (char[]) cloned.chars.clone();
1109	}
1110	cloned.reset();
1111	return cloned;
1112	}
1113
1114	//-----------------------------------------------------------------------
1115	/**
1116	* Gets the String content that the tokenizer is parsing.
1117	*
1118	* @return the string content being parsed
1119	*/
1120	public String toString() {
1121	if (tokens == null) {
1122	return "StrTokenizer[not tokenized yet]";
1123	}
1124	return "StrTokenizer" + getTokenList();
1125	}
1126
1127	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: src/main/java/agents/org/apache/commons/lang/text/StrTokenizer.java

Download in other formats: