diff --git a/hutool-http/src/main/java/org/dromara/hutool/http/html/HtmlUtil.java b/hutool-http/src/main/java/org/dromara/hutool/http/html/HtmlUtil.java
index 998bcd774..6cffc6e67 100644
--- a/hutool-http/src/main/java/org/dromara/hutool/http/html/HtmlUtil.java
+++ b/hutool-http/src/main/java/org/dromara/hutool/http/html/HtmlUtil.java
@@ -41,6 +41,10 @@ public class HtmlUtil {
* HTML标签正则
*/
public static final Pattern RE_HTML_MARK = Pattern.compile("(<[^<]*?>)|(<\\s*?/[^<]*?>)|(<[^<]*?/\\s*?>)", Pattern.CASE_INSENSITIVE);
+ /**
+  * Regex for an empty HTML element: an opening tag (attributes allowed), optional whitespace only, then the closing tag {@code </name>} whose name is back-referenced from the opening tag.
+  */
+ public static final String RE_HTML_EMPTY_MARK = "<(\\w+)([^>]*)>\\s*</\\1>";
/**
* script标签正则
*/
@@ -111,6 +115,17 @@ public class HtmlUtil {
return ReUtil.replaceAll(content, RE_HTML_MARK, StrUtil.EMPTY);
}
+ /**
+  * Removes every empty HTML tag from the text, i.e. every match of {@link #RE_HTML_EMPTY_MARK}.
+  * Intended for markup such as {@code <p></p>}.
+  *
+  * @param content the text to clean; may be {@code null}
+  * @return the text with all matches removed, or {@code content} itself when it is {@code null} or empty
+  */
+ public static String cleanEmptyTag(final String content) {
+     if (content == null || content.isEmpty()) {
+         // Nothing to strip; returning early also avoids a NullPointerException on null input
+         return content;
+     }
+     return content.replaceAll(RE_HTML_EMPTY_MARK, StrUtil.EMPTY);
+ }
+
/**
* 清除所有script标签,包括内容
*
diff --git a/hutool-http/src/test/java/org/dromara/hutool/http/html/HtmlUtilTest.java b/hutool-http/src/test/java/org/dromara/hutool/http/html/HtmlUtilTest.java
index 569c37e77..c0978a56e 100644
--- a/hutool-http/src/test/java/org/dromara/hutool/http/html/HtmlUtilTest.java
+++ b/hutool-http/src/test/java/org/dromara/hutool/http/html/HtmlUtilTest.java
@@ -18,9 +18,10 @@ package org.dromara.hutool.http.html;
import org.dromara.hutool.core.regex.ReUtil;
import org.dromara.hutool.http.meta.ContentTypeUtil;
-import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
/**
* Html单元测试
*
@@ -34,32 +35,32 @@ public class HtmlUtilTest {
//非闭合标签
String str = "pre
";
String result = HtmlUtil.removeHtmlTag(str, "img");
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
//闭合标签
str = "pre
";
result = HtmlUtil.removeHtmlTag(str, "img");
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
//闭合标签
str = "pre
";
result = HtmlUtil.removeHtmlTag(str, "img");
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
//闭合标签
str = "pre
";
result = HtmlUtil.removeHtmlTag(str, "img");
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
//包含内容标签
str = "predfdsfdsfdsf
";
result = HtmlUtil.removeHtmlTag(str, "div");
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
//带换行
str = "pre\r\n\t\tdfdsfdsfdsf\r\n
";
result = HtmlUtil.removeHtmlTag(str, "div");
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
}
@Test
@@ -67,32 +68,32 @@ public class HtmlUtilTest {
//非闭合标签
String str = "pre
";
String result = HtmlUtil.cleanHtmlTag(str);
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
//闭合标签
str = "pre
";
result = HtmlUtil.cleanHtmlTag(str);
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
//闭合标签
str = "pre
";
result = HtmlUtil.cleanHtmlTag(str);
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
//闭合标签
str = "pre
";
result = HtmlUtil.cleanHtmlTag(str);
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
//包含内容标签
str = "predfdsfdsfdsf
";
result = HtmlUtil.cleanHtmlTag(str);
- Assertions.assertEquals("predfdsfdsfdsf", result);
+ assertEquals("predfdsfdsfdsf", result);
//带换行
str = "pre\r\n\t\tdfdsfdsfdsf\r\n
BBBB
";
result = HtmlUtil.cleanHtmlTag(str);
- Assertions.assertEquals("pre\r\n\t\tdfdsfdsfdsf\r\nBBBB", result);
+ assertEquals("pre\r\n\t\tdfdsfdsfdsf\r\nBBBB", result);
}
@Test
@@ -100,37 +101,37 @@ public class HtmlUtilTest {
//非闭合标签
String str = "pre
";
String result = HtmlUtil.unwrapHtmlTag(str, "img");
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
//闭合标签
str = "pre
";
result = HtmlUtil.unwrapHtmlTag(str, "img");
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
//闭合标签
str = "pre
";
result = HtmlUtil.unwrapHtmlTag(str, "img");
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
//闭合标签
str = "pre
";
result = HtmlUtil.unwrapHtmlTag(str, "img");
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
//闭合标签
str = "pre
";
result = HtmlUtil.unwrapHtmlTag(str, "img");
- Assertions.assertEquals("pre", result);
+ assertEquals("pre", result);
//包含内容标签
str = "preabc
";
result = HtmlUtil.unwrapHtmlTag(str, "div");
- Assertions.assertEquals("preabc", result);
+ assertEquals("preabc", result);
//带换行
str = "pre\r\n\t\tabc\r\n
";
result = HtmlUtil.unwrapHtmlTag(str, "div");
- Assertions.assertEquals("pre\r\n\t\tabc\r\n", result);
+ assertEquals("pre\r\n\t\tabc\r\n", result);
}
@Test
@@ -139,34 +140,34 @@ public class HtmlUtilTest {
final String htmlString = "
测试文本";
final String tagString = "i,br";
final String cleanTxt = HtmlUtil.removeHtmlTag(htmlString, false, tagString.split(","));
- Assertions.assertEquals("
测试文本", cleanTxt);
+ assertEquals("
测试文本", cleanTxt);
}
// Escapes a string with HtmlUtil.escape, compares the result with an expected literal,
// then checks that HtmlUtil.unescape restores the original input.
// NOTE(review): the expected escaped literal contains an <html><body> wrapper that the input
// literal does not — confirm both literals against the repository copy of this test.
@Test
public void escapeTest() {
final String html = "123'123'";
final String escape = HtmlUtil.escape(html);
- Assertions.assertEquals("<html><body>123'123'</body></html>", escape);
+ assertEquals("<html><body>123'123'</body></html>", escape);
final String restoreEscaped = HtmlUtil.unescape(escape);
- Assertions.assertEquals(html, restoreEscaped);
- Assertions.assertEquals("'", HtmlUtil.unescape("'"));
+ assertEquals(html, restoreEscaped);
+ assertEquals("'", HtmlUtil.unescape("'"));
}
@Test
public void escapeTest2() {
final char c = ' '; // 不断开空格(non-breaking space,缩写nbsp。)
- Assertions.assertEquals(c, 160);
+ assertEquals(c, 160);
final String html = " ";
final String escape = HtmlUtil.escape(html);
- Assertions.assertEquals("<html><body> </body></html>", escape);
- Assertions.assertEquals(" ", HtmlUtil.unescape(" "));
+ assertEquals("<html><body> </body></html>", escape);
+ assertEquals(" ", HtmlUtil.unescape(" "));
}
// Passes a string through HtmlUtil.filter and compares the result with the expected literal.
// NOTE(review): the input shown here is an empty literal, so only the empty-input path is
// exercised — confirm that this is the intended fixture.
@Test
public void filterTest() {
final String html = "";
final String filter = HtmlUtil.filter(html);
- Assertions.assertEquals("", filter);
+ assertEquals("", filter);
}
@Test
@@ -175,54 +176,77 @@ public class HtmlUtilTest {
// 去除的属性加双引号测试
String html = "";
String result = HtmlUtil.removeHtmlAttr(html, "class");
- Assertions.assertEquals("", result);
+ assertEquals("", result);
// 去除的属性后跟空格、加单引号、不加引号测试
html = "";
result = HtmlUtil.removeHtmlAttr(html, "class");
- Assertions.assertEquals("", result);
+ assertEquals("", result);
// 去除的属性位于标签末尾、其它属性前测试
html = "";
result = HtmlUtil.removeHtmlAttr(html, "class");
- Assertions.assertEquals("", result);
+ assertEquals("", result);
// 去除的属性名和值之间存在空格
html = "";
result = HtmlUtil.removeHtmlAttr(html, "class");
- Assertions.assertEquals("", result);
+ assertEquals("", result);
}
// Calls HtmlUtil.removeAllHtmlAttr for the "div" tag and compares the result with the expected literal.
// NOTE(review): the input shown here is an empty literal, so no attribute removal is actually
// observed — confirm that this is the intended fixture.
@Test
public void removeAllHtmlAttrTest() {
final String html = "";
final String result = HtmlUtil.removeAllHtmlAttr(html, "div");
- Assertions.assertEquals("", result);
+ assertEquals("", result);
}
@Test
public void getCharsetTest() {
String charsetName = ReUtil.get(ContentTypeUtil.CHARSET_PATTERN, "Charset=UTF-8;fq=0.9", 1);
- Assertions.assertEquals("UTF-8", charsetName);
+ assertEquals("UTF-8", charsetName);
charsetName = ReUtil.get(HtmlUtil.META_CHARSET_PATTERN, "hello world