python 格式化html + js/jq 格式化html

192 阅读 0 评论 127 点赞

我是靠谱客的博主碧蓝星月，这篇文章主要介绍python 格式化html + js/jq 格式化html，现在分享给大家，希望可以做个参考。

将用户输入的内容（尤其是粘贴进来的内容）格式化为标准的文章格式（样式可以自行定制，修改代码最后的 CSS 样式即可）。

用户把文章粘贴进来的时候，我们自己的文本编辑框会出问题：从网页上复制的内容大部分都带有格式，再加上各种恶心的自定义标签，处理难度就更大了。

话不多说，代码有啥问题，可以交流，重点是后端的python代码。

1、前端获取粘贴之后的数据（因为环境不一样，安卓获取不到粘贴板），然后先简单的去除下格式

v3ui_richdiv 是我们自己的插件。

{# paste事件中添加清除格式，然后再进行覆盖原来粘贴的内容，避免各种各样不规则文本#}

// On paste into the rich-text editor: replace the editor content with the
// filtered HTML, strip the remaining inline formatting, and put the caret
// at the end of the text.
document.querySelector(".v3ui_richdiv").addEventListener("paste", function(e) {
    // Editing commands, executed strictly in this order.
    // "removeFormat" is listed twice on purpose: a single pass sometimes
    // leaves some formatting behind.
    var editingCommands = [
        "selectAll",
        "removeFormat",
        "removeFormat",
        "unlink",
        "justifyLeft",
        "unselect"
    ];

    // Deferred with a zero-delay timer — presumably so that the browser has
    // already inserted the pasted content by the time the cleanup runs.
    setTimeout(function() {
        var editorSelector = '.v3ui_richdiv';

        // Overwrite the editor content with the filtered HTML.
        $(editorSelector).v3ui_richdiv('html', filter_html());

        editingCommands.forEach(function(commandName) {
            document.execCommand(commandName);
        });

        // Drop class and style attributes from every descendant element.
        var $descendants = $(editorSelector).find('*');
        $descendants.removeAttr('class');
        $descendants.removeAttr('style');

        // Move the caret to the end of the text.
        msgTextLastPos(document.querySelector(".v3ui_richdiv"));
    }, 0);
});

// Move the caret to the end of the content of `obj`.
// Two code paths are kept for browser compatibility: the Selection API
// (window.getSelection) and the legacy document.selection text-range API.
function msgTextLastPos(obj) {
    if (window.getSelection) {
        obj.focus();
        var selection = window.getSelection();
        // Select everything inside the element, then collapse the selection
        // to its end so that only a caret remains there.
        selection.selectAllChildren(obj);
        selection.collapseToEnd();
        return;
    }

    if (!document.selection) {
        // Neither selection API is available: nothing to do.
        return;
    }

    var textRange = document.selection.createRange();
    textRange.moveToElementText(obj);
    // collapse(false) collapses the range to its end point.
    textRange.collapse(false);
    textRange.select();
}

2、这段python代码是核心，去除格式，拿到数据，然后添加文章样式。

def filter_html(ct_comment):
    """Reduce pasted HTML to plain ``<p class='_style'>`` paragraphs.

    Steps:

    1. Decode the input with ``smart_decode`` (helper defined outside this
       block; presumably returns a text string -- verify).
    2. Delete a fixed set of HTML entities (``&nbsp;``, ``&quot;``, ...).
    3. Parse the markup with ``pq`` (defined outside this block; presumably
       pyquery's ``PyQuery`` -- verify) and remove ``img``, ``video``,
       ``style`` and ``script`` elements.
    4. Walk every remaining element and collect its text into paragraphs:
       ``p``/``div``/``br`` start a new paragraph, ``tr`` starts an empty
       one, and any other element's text is appended to the current
       paragraph.
    5. Strip invisible characters from each paragraph, drop empty
       paragraphs, HTML-escape the text and wrap it in
       ``<p class='_style'>``.
    6. Append the ``<style>`` block that defines ``._style``.

    :param ct_comment: raw HTML supplied by the user (typically pasted).
    :return: the paragraph markup followed by the ``<style>`` block and a
        single trailing space.
    """
    # Imported locally so the block does not depend on the file's import
    # section; available in both Python 2 and Python 3.
    from xml.sax.saxutils import escape

    # Code points removed from every paragraph.
    # NOTE(review): this list includes U+0020 (ordinary space), so spaces
    # between Latin words are removed as well -- presumably acceptable
    # because the target content is Chinese text; confirm.
    invisible_chars = (
        u'\u3000', u'\u0020', u'\u0009', u'\u000b', u'\u000c', u'\u00a0',
        u'\u000a', u'\u000d', u'\u2028', u'\u2029', u'\u200f', u'\u200e',
        u'\u200d', u'\ufeff',
    )
    # Entities deleted from the raw markup before it is parsed.
    dropped_entities = (
        '&nbsp;', '&quot;', '&ensp;', '&emsp;', '&thinsp;', '&zwnj;', '&zwj;',
    )
    # Elements that begin a new paragraph.
    paragraph_tags = ('p', 'div', 'br')
    style_block = (
        '<style type="text/css">._style{text-indent: 2em;font-size:17px;'
        'line-height:28px;font-family:"Microsoft YaHei"}</style>'
    )

    def replace_unvisiable(content):
        """Return *content* without invisible characters, stripped."""
        for char in invisible_chars:
            content = content.replace(char, '')
        return content.strip()

    ct_comment = smart_decode(ct_comment)
    for entity in dropped_entities:
        ct_comment = ct_comment.replace(entity, '')

    document = pq(ct_comment)

    # Remove elements whose content must never reach the article.
    for selector in ('img', 'video', 'style', 'script'):
        document(selector).remove()

    # Collect the text of all remaining elements into paragraphs.
    # NOTE(review): the tail of a p/div/br element is appended when the
    # element itself is visited, i.e. before the text of its descendants
    # (assuming the selection iterates in document order). For
    # ``<p>a<b>b</b></p>c`` this yields the paragraphs "a" and "cb".
    # Confirm whether that ordering is acceptable.
    paragraphs = ['']
    for node in document('*'):
        text, tail = node.text, node.tail
        if node.tag in paragraph_tags:
            if text:
                paragraphs.append(text)
            if tail:
                paragraphs.append(tail)
            if not text and not tail:
                paragraphs.append('')
        elif node.tag == 'tr':
            # Each table row starts a fresh paragraph.
            paragraphs.append('')
        else:
            # Inline content continues the current paragraph.
            if text:
                paragraphs[-1] += text
            if tail:
                paragraphs[-1] += tail

    cleaned = [replace_unvisiable(paragraph) for paragraph in paragraphs if paragraph]

    # The parser has already decoded entities, so the collected text may
    # contain raw '<', '>' and '&' (e.g. from a pasted "&lt;script&gt;").
    # Escape it so user text can never turn into markup in the output.
    body = ''.join(
        "<p class='_style'>" + escape(paragraph) + "</p>"
        for paragraph in cleaned
        if paragraph
    )

    # The paragraph text is escaped above, so the stylesheet markup cannot
    # already occur in ``body``; it is appended exactly once.
    return body + style_block + ' '