处理 HTML 文本，只保留指定的标签和属性

import html
from html import parser

from lxml.html.clean import Cleaner

# Attributes that are kept on the surviving tags (only src and href).
safe_attrs = ['src', 'href']
# Tags that are kept in the cleaned output.
allow_tags = [
    "div", "p", "img", "video",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "br", "a", "blockquote",
]

# remove_unknown_tags is switched off because lxml refuses the combination
# of an explicit allow_tags whitelist with remove_unknown_tags=True.
cleaner = Cleaner(safe_attrs=safe_attrs, allow_tags=allow_tags, remove_unknown_tags=False)
# `content` is the raw HTML string to sanitize; it must be defined before
# this point (it is not defined in this snippet).
clean_content = cleaner.clean_html(content)

# URLs inside the kept attributes come back entity-encoded from the cleaner
# (e.g. "&" becomes "&amp;"), so decode the character references.
# Use the documented html.unescape rather than html.parser.unescape, which is
# only an undocumented re-export inside the parser module.
# NOTE(review): this decodes every character reference in the document, not
# just those in URLs, so escaped text such as "&lt;script&gt;" turns back into
# literal markup. Confirm that is acceptable before rendering untrusted input.
clean_content = html.unescape(clean_content)

原文链接：https://blog.csdn.net/yezi1993/article/details/109669509