mirror of
https://gitee.com/chinabugotech/hutool.git
synced 2025-07-21 15:09:48 +08:00
修复CsvParser中对正文中双引号处理逻辑问题
This commit is contained in:
@@ -2645,7 +2645,7 @@ public class CharSequenceUtil extends StrValidator {
|
|||||||
if (isEmpty(str)) {
|
if (isEmpty(str)) {
|
||||||
return toStringOrNull(str);
|
return toStringOrNull(str);
|
||||||
}
|
}
|
||||||
if (str.charAt(0) == prefix && str.charAt(str.length() - 1) == suffix) {
|
if (isWrap(str, prefix, suffix)) {
|
||||||
return sub(str, 1, str.length() - 1);
|
return sub(str, 1, str.length() - 1);
|
||||||
}
|
}
|
||||||
return str.toString();
|
return str.toString();
|
||||||
|
@@ -92,8 +92,8 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S
|
|||||||
/**
|
/**
|
||||||
* CSV解析器
|
* CSV解析器
|
||||||
*
|
*
|
||||||
* @param reader Reader
|
* @param reader Reader
|
||||||
* @param config 配置,null则为默认配置
|
* @param config 配置,null则为默认配置
|
||||||
* @param bufferSize 默认缓存大小
|
* @param bufferSize 默认缓存大小
|
||||||
*/
|
*/
|
||||||
public CsvParser(final Reader reader, final CsvReadConfig config, final int bufferSize) {
|
public CsvParser(final Reader reader, final CsvReadConfig config, final int bufferSize) {
|
||||||
@@ -109,7 +109,7 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S
|
|||||||
* @throws IllegalStateException 如果不解析头部或者没有调用nextRow()方法
|
* @throws IllegalStateException 如果不解析头部或者没有调用nextRow()方法
|
||||||
*/
|
*/
|
||||||
public List<String> getHeader() {
|
public List<String> getHeader() {
|
||||||
if (config.headerLineNo < 0) {
|
if (config.headerLineNo < 0) {
|
||||||
throw new IllegalStateException("No header available - header parsing is disabled");
|
throw new IllegalStateException("No header available - header parsing is disabled");
|
||||||
}
|
}
|
||||||
if (lineNo < config.beginLineNo) {
|
if (lineNo < config.beginLineNo) {
|
||||||
@@ -141,11 +141,11 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 读取范围校验
|
// 读取范围校验
|
||||||
if(lineNo < config.beginLineNo){
|
if (lineNo < config.beginLineNo) {
|
||||||
// 未达到读取起始行,继续
|
// 未达到读取起始行,继续
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if(lineNo > config.endLineNo){
|
if (lineNo > config.endLineNo) {
|
||||||
// 超出结束行,读取结束
|
// 超出结束行,读取结束
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -209,7 +209,7 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S
|
|||||||
* 空行是size为1的List,唯一元素是""
|
* 空行是size为1的List,唯一元素是""
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* 行号要考虑注释行和引号包装的内容中的换行
|
* 行号要考虑注释行和引号包装的内容中的换行
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* @return 一行数据
|
* @return 一行数据
|
||||||
@@ -218,7 +218,7 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S
|
|||||||
private List<String> readLine() throws IORuntimeException {
|
private List<String> readLine() throws IORuntimeException {
|
||||||
// 矫正行号
|
// 矫正行号
|
||||||
// 当一行内容包含多行数据时,记录首行行号,但是读取下一行时,需要把多行内容的行数加上
|
// 当一行内容包含多行数据时,记录首行行号,但是读取下一行时,需要把多行内容的行数加上
|
||||||
if(inQuotesLineCount > 0){
|
if (inQuotesLineCount > 0) {
|
||||||
this.lineNo += this.inQuotesLineCount;
|
this.lineNo += this.inQuotesLineCount;
|
||||||
this.inQuotesLineCount = 0;
|
this.inQuotesLineCount = 0;
|
||||||
}
|
}
|
||||||
@@ -257,16 +257,16 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S
|
|||||||
final char c = buf.get();
|
final char c = buf.get();
|
||||||
|
|
||||||
// 注释行标记
|
// 注释行标记
|
||||||
if(preChar < 0 || preChar == CharUtil.CR || preChar == CharUtil.LF){
|
if (preChar < 0 || preChar == CharUtil.CR || preChar == CharUtil.LF) {
|
||||||
// 判断行首字符为指定注释字符的注释开始,直到遇到换行符
|
// 判断行首字符为指定注释字符的注释开始,直到遇到换行符
|
||||||
// 行首分两种,1是preChar < 0表示文本开始,2是换行符后紧跟就是下一行的开始
|
// 行首分两种,1是preChar < 0表示文本开始,2是换行符后紧跟就是下一行的开始
|
||||||
// issue#IA8WE0 如果注释符出现在包装符内,被认为是普通字符
|
// issue#IA8WE0 如果注释符出现在包装符内,被认为是普通字符
|
||||||
if(!inQuotes && null != this.config.commentCharacter && c == this.config.commentCharacter){
|
if (!inQuotes && null != this.config.commentCharacter && c == this.config.commentCharacter) {
|
||||||
inComment = true;
|
inComment = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 注释行处理
|
// 注释行处理
|
||||||
if(inComment){
|
if (inComment) {
|
||||||
if (c == CharUtil.CR || c == CharUtil.LF) {
|
if (c == CharUtil.CR || c == CharUtil.LF) {
|
||||||
// 注释行以换行符为结尾
|
// 注释行以换行符为结尾
|
||||||
lineNo++;
|
lineNo++;
|
||||||
@@ -302,8 +302,8 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S
|
|||||||
buf.mark();
|
buf.mark();
|
||||||
addField(currentFields, currentField.toString());
|
addField(currentFields, currentField.toString());
|
||||||
currentField.setLength(0);
|
currentField.setLength(0);
|
||||||
} else if (c == config.textDelimiter) {
|
} else if (c == config.textDelimiter && isFieldBegin(preChar)) {
|
||||||
// 引号开始
|
// 引号开始且出现在字段开头
|
||||||
inQuotes = true;
|
inQuotes = true;
|
||||||
copyLen++;
|
copyLen++;
|
||||||
} else if (c == CharUtil.CR) {
|
} else if (c == CharUtil.CR) {
|
||||||
@@ -361,11 +361,15 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S
|
|||||||
final char textDelimiter = this.config.textDelimiter;
|
final char textDelimiter = this.config.textDelimiter;
|
||||||
|
|
||||||
// 忽略多余引号后的换行符
|
// 忽略多余引号后的换行符
|
||||||
field = StrUtil.trim(field, StrTrimer.TrimMode.SUFFIX, (c-> c == CharUtil.LF || c == CharUtil.CR));
|
field = StrUtil.trim(field, StrTrimer.TrimMode.SUFFIX, (c -> c == CharUtil.LF || c == CharUtil.CR));
|
||||||
|
|
||||||
field = StrUtil.unWrap(field, textDelimiter);
|
if(StrUtil.isWrap(field, textDelimiter)){
|
||||||
field = StrUtil.replace(field, String.valueOf(textDelimiter) + textDelimiter, String.valueOf(textDelimiter));
|
field = StrUtil.sub(field, 1, field.length() - 1);
|
||||||
if(this.config.trimField){
|
// https://datatracker.ietf.org/doc/html/rfc4180#section-2
|
||||||
|
// 第七条规则,只有包装内的包装符需要转义
|
||||||
|
field = StrUtil.replace(field, String.valueOf(textDelimiter) + textDelimiter, String.valueOf(textDelimiter));
|
||||||
|
}
|
||||||
|
if (this.config.trimField) {
|
||||||
// issue#I49M0C@Gitee
|
// issue#I49M0C@Gitee
|
||||||
field = StrUtil.trim(field);
|
field = StrUtil.trim(field);
|
||||||
}
|
}
|
||||||
@@ -384,12 +388,30 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S
|
|||||||
return (c == CharUtil.CR || c == CharUtil.LF) && preChar != CharUtil.CR;
|
return (c == CharUtil.CR || c == CharUtil.LF) && preChar != CharUtil.CR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 通过前一个字符,判断是否字段开始,几种情况:
|
||||||
|
* <ul>
|
||||||
|
* <li>正文开头,无前字符</li>
|
||||||
|
* <li>字段分隔符,即上个字段结束</li>
|
||||||
|
* <li>换行符,即新行开始</li>
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* @param preChar 前字符
|
||||||
|
* @return 是否字段开始
|
||||||
|
*/
|
||||||
|
private boolean isFieldBegin(final int preChar) {
|
||||||
|
return preChar == -1
|
||||||
|
|| preChar == config.fieldSeparator
|
||||||
|
|| preChar == CharUtil.LF
|
||||||
|
|| preChar == CharUtil.CR;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 内部Buffer
|
* 内部Buffer
|
||||||
*
|
*
|
||||||
* @author looly
|
* @author looly
|
||||||
*/
|
*/
|
||||||
private static class Buffer implements Serializable{
|
private static class Buffer implements Serializable {
|
||||||
private static final long serialVersionUID = 1L;
|
private static final long serialVersionUID = 1L;
|
||||||
|
|
||||||
final char[] buf;
|
final char[] buf;
|
||||||
|
@@ -8,9 +8,13 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 按照 https://datatracker.ietf.org/doc/html/rfc4180#section-2<br>
|
* 按照 https://datatracker.ietf.org/doc/html/rfc4180#section-2<br>
|
||||||
* 如果字段正文中出现双引号,需要使用两个双引号表示转义
|
* 如果字段正文中出现双引号,需要使用两个双引号表示转义,并整段使用引号包裹
|
||||||
*/
|
*/
|
||||||
public class Pr1244Test {
|
public class Pr1244Test {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 此测试中没有引号包裹,则所有引号都被当作内容
|
||||||
|
*/
|
||||||
@Test
|
@Test
|
||||||
void csvReadTest() {
|
void csvReadTest() {
|
||||||
final String csv = "a,q\"\"e,d,f";
|
final String csv = "a,q\"\"e,d,f";
|
||||||
@@ -18,6 +22,21 @@ public class Pr1244Test {
|
|||||||
final CsvData read = reader.read();
|
final CsvData read = reader.read();
|
||||||
assertEquals(4, read.getRow(0).size());
|
assertEquals(4, read.getRow(0).size());
|
||||||
assertEquals("a", read.getRow(0).get(0));
|
assertEquals("a", read.getRow(0).get(0));
|
||||||
|
assertEquals("q\"\"e", read.getRow(0).get(1));
|
||||||
|
assertEquals("d", read.getRow(0).get(2));
|
||||||
|
assertEquals("f", read.getRow(0).get(3));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 此测试中没有引号包裹,则所有引号都被当作内容
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
void csvReadTest2() {
|
||||||
|
final String csv = "a,q\"e,d,f";
|
||||||
|
final CsvReader reader = CsvUtil.getReader(new StringReader(csv));
|
||||||
|
final CsvData read = reader.read();
|
||||||
|
assertEquals(4, read.getRow(0).size());
|
||||||
|
assertEquals("a", read.getRow(0).get(0));
|
||||||
assertEquals("q\"e", read.getRow(0).get(1));
|
assertEquals("q\"e", read.getRow(0).get(1));
|
||||||
assertEquals("d", read.getRow(0).get(2));
|
assertEquals("d", read.getRow(0).get(2));
|
||||||
assertEquals("f", read.getRow(0).get(3));
|
assertEquals("f", read.getRow(0).get(3));
|
||||||
|
Reference in New Issue
Block a user