diff --git a/hutool-http/src/main/java/cn/hutool/http/HtmlUtil.java b/hutool-http/src/main/java/cn/hutool/http/HtmlUtil.java
index 74a1433f7..5224390ff 100644
--- a/hutool-http/src/main/java/cn/hutool/http/HtmlUtil.java
+++ b/hutool-http/src/main/java/cn/hutool/http/HtmlUtil.java
@@ -7,11 +7,15 @@ import cn.hutool.core.util.StrUtil;
/**
* HTML工具类
*
+ *
+ * 比如我们在使用爬虫爬取HTML页面后,需要对返回页面的HTML内容做一定处理,
+ * 比如去掉指定标签(例如广告栏等)、去除JS、去掉样式等等,这些操作都可以使用此工具类完成。
+ *
* @author xiaoleilu
*
*/
public class HtmlUtil {
-
+
public static final String NBSP = StrUtil.HTML_NBSP;
public static final String AMP = StrUtil.HTML_AMP;
public static final String QUOTE = StrUtil.HTML_QUOTE;
@@ -36,12 +40,12 @@ public class HtmlUtil {
TEXT['<'] = LT.toCharArray(); // 小于号
TEXT['>'] = GT.toCharArray(); // 大于号
}
-
+
/**
* 转义文本中的HTML字符为安全的字符,以下字符被转义:
*
- * - ' 替换为 &#039; (&apos; doesn't work in HTML4)
- * - " 替换为 &quot;
+ * - ' 替换为 &#039; (&apos; doesn't work in HTML4)
+ * - " 替换为 &quot;
* - & 替换为 &amp;
* - < 替换为 &lt;
* - > 替换为 &gt;
@@ -64,14 +68,14 @@ public class HtmlUtil {
if (StrUtil.isBlank(htmlStr)) {
return htmlStr;
}
-
+
return EscapeUtil.unescapeHtml4(htmlStr);
}
// ---------------------------------------------------------------- encode text
/**
- * 清除所有HTML标签
+ * 清除所有HTML标签,但是不删除标签内的内容
*
* @param content 文本
* @return 清除标签后的文本
@@ -135,7 +139,7 @@ public class HtmlUtil {
}
/**
- * 去除HTML标签中的属性
+ * 去除HTML标签中的属性,如果多个标签有相同属性,都去除
*
* @param content 文本
* @param attrs 属性名(不区分大小写)
@@ -144,6 +148,7 @@ public class HtmlUtil {
public static String removeHtmlAttr(String content, String... attrs) {
String regex = null;
for (String attr : attrs) {
+ // (?i)表示忽略大小写
regex = StrUtil.format("(?i)\\s*{}=([\"']).*?\\1", attr);
content = content.replaceAll(regex, StrUtil.EMPTY);
}
diff --git a/hutool-http/src/test/java/cn/hutool/http/test/HtmlUtilTest.java b/hutool-http/src/test/java/cn/hutool/http/test/HtmlUtilTest.java
index 8175beb0c..bc9ec6bc4 100644
--- a/hutool-http/src/test/java/cn/hutool/http/test/HtmlUtilTest.java
+++ b/hutool-http/src/test/java/cn/hutool/http/test/HtmlUtilTest.java
@@ -46,6 +46,39 @@ public class HtmlUtilTest {
Assert.assertEquals("pre", result);
}
+ @Test
+ public void cleanHtmlTagTest() { // Runs HtmlUtil.cleanHtmlTag on several inputs and compares each returned string with the expected text.
+ // Unclosed tag. NOTE(review): the input literals in this method are split across lines in this view, presumably because the markup inside them was lost; confirm against the original test file.
+ String str = "pre
";
+ String result = HtmlUtil.cleanHtmlTag(str);
+ Assert.assertEquals("pre", result);
+
+ // Closed tag
+ str = "pre
";
+ result = HtmlUtil.cleanHtmlTag(str);
+ Assert.assertEquals("pre", result);
+
+ // Closed tag
+ str = "pre
";
+ result = HtmlUtil.cleanHtmlTag(str);
+ Assert.assertEquals("pre", result);
+
+ // Closed tag
+ str = "pre
";
+ result = HtmlUtil.cleanHtmlTag(str);
+ Assert.assertEquals("pre", result);
+
+ // Tag that contains content
+ str = "predfdsfdsfdsf
";
+ result = HtmlUtil.cleanHtmlTag(str);
+ Assert.assertEquals("predfdsfdsfdsf", result);
+
+ // Input containing line breaks
+ str = "pre\r\n\t\tdfdsfdsfdsf\r\n
BBBB
";
+ result = HtmlUtil.cleanHtmlTag(str);
+ Assert.assertEquals("pre\r\n\t\tdfdsfdsfdsf\r\nBBBB", result);
+ }
+
@Test
public void unwrapHtmlTagTest() {
//非闭合标签
@@ -83,6 +116,7 @@ public class HtmlUtilTest {
public void escapeTest() {
String html = "123'123'";
String escape = HtmlUtil.escape(html);
+ Assert.assertEquals("<html><body>123'123'</body></html>", escape);
String restoreEscaped = HtmlUtil.unescape(escape);
Assert.assertEquals(html, restoreEscaped);
}
@@ -93,4 +127,18 @@ public class HtmlUtilTest {
String filter = HtmlUtil.filter(html);
Assert.assertEquals("", filter);
}
+
+ @Test
+ public void removeHtmlAttrTest() { // Calls HtmlUtil.removeHtmlAttr with the attribute name "class" and checks the returned string.
+ String html = ""; // NOTE(review): this input and the expected value below are both empty in this view, presumably because the HTML markup was lost from the literals; confirm against the original test file.
+ String result = HtmlUtil.removeHtmlAttr(html, "class");
+ Assert.assertEquals("", result);
+ }
+
+ @Test
+ public void removeAllHtmlAttrTest() { // Calls HtmlUtil.removeAllHtmlAttr, passing "div" as the second argument, and checks the returned string.
+ String html = ""; // NOTE(review): this input and the expected value below are both empty in this view, presumably because the HTML markup was lost from the literals; confirm against the original test file.
+ String result = HtmlUtil.removeAllHtmlAttr(html, "div");
+ Assert.assertEquals("", result);
+ }
}