处理html文本,保留指定标签、属性

from html import parser
from html import unescape

from lxml.html.clean import Cleaner

# Sanitize an HTML string, keeping only a whitelist of tags and attributes.

# Attributes to keep on tags (passed to the Cleaner as ``safe_attrs``).
safe_attrs = ['src', 'href']
# Tags to keep (passed to the Cleaner as ``allow_tags``).
allow_tags = [
    "div", "p", "img", "video",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "br", "a", "blockquote",
]

# remove_unknown_tags is switched off explicitly because lxml's Cleaner does
# not accept allow_tags together with remove_unknown_tags=True.
cleaner = Cleaner(
    safe_attrs=safe_attrs,
    allow_tags=allow_tags,
    remove_unknown_tags=False,
)
# `content` is not defined in this snippet; presumably it is the raw HTML
# string to sanitize and must be assigned before this line runs.
clean_content = cleaner.clean_html(content)

# URLs inside the cleaned tags come back entity-encoded, so decode them.
# html.unescape is the documented public function; html.parser only exposes
# the same function as an incidental internal import.
# NOTE(review): this unescapes the WHOLE document, not just src/href values,
# so escaped text such as "&lt;script&gt;" also becomes live markup again.
# Confirm the result is never rendered as HTML from untrusted input, or
# restrict the decoding to attribute values.
clean_content = unescape(clean_content)

版权声明:本文为yezi1993原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。