def remove_js_css(content):
    """Strip non-content markup from an HTML string.

    The following spans are deleted, in this order:

    * ``<script ...>...</script>`` blocks
    * ``<style ...>...</style>`` blocks
    * ``<!-- ... -->`` comments
    * ``<meta ...>`` tags
    * ``<ins ...>...</ins>`` blocks

    Matching is case-insensitive, non-greedy, and may span several lines.

    Args:
        content: HTML source text.

    Returns:
        A copy of ``content`` with every matched span removed.
    """
    patterns = (
        r'''<script.*?</script>''',
        r'''<style.*?</style>''',
        r'''<!--.*?-->''',
        r'''<meta.*?>''',
        r'''<ins.*?</ins>''',
    )
    s = content
    for pattern in patterns:
        # re.S lets "." cross newlines so multi-line blocks are removed whole;
        # re.I makes tag names case-insensitive.
        s = re.sub(pattern, '', s, flags=re.I | re.S)
    return s
# Remove blank lines
def remove_empty_line(content):
    """Drop whitespace-only lines and collapse runs of newlines.

    First the contents of lines made up solely of whitespace are deleted,
    then every run of consecutive newline characters is replaced by a
    single newline.

    Args:
        content: Text to clean.

    Returns:
        The text with no blank lines between non-blank lines.
    """
    # re.M makes ^ and $ anchor at every line boundary, not only at the
    # ends of the whole string.
    s = re.sub(r'''^\s+$''', '', content, flags=re.M)
    return re.sub(r'''\n+''', '\n', s)
|
def remove_any_tag(s):
    """Delete every ``<...>`` tag from a string and trim the result.

    Anything from a ``<`` up to the next ``>`` is treated as a tag and
    removed; a ``<`` with no following ``>`` is left in place.

    Args:
        s: Text that may contain markup tags.

    Returns:
        The text without tags, with leading and trailing whitespace stripped.
    """
    without_tags = re.sub(r'''<[^>]+>''', '', s)
    return without_tags.strip()
def remove_any_tag_but_a(s):
    """Measure link text against total text in an HTML fragment.

    Despite the name, nothing is removed from ``s`` and no string is
    returned; the function only reports two lengths.

    Args:
        s: HTML fragment to measure.

    Returns:
        A ``(link_length, text_length)`` tuple where ``link_length`` is the
        combined length of everything found between ``<a ...>`` and ``</a>``
        (nested markup inside the anchor is counted as-is) and
        ``text_length`` is the length of ``s`` after ``remove_any_tag``.
    """
    # The tag name must be exactly "a": either "<a>" or "<a" followed by
    # whitespace and attributes. The earlier "<a[^r]" form could not match a
    # bare "<a>" and also matched tags such as <abbr> or <address>.
    anchor_texts = re.findall(
        r'''<a(?:\s[^>]*)?>(.*?)</a>''', s, re.I | re.S
    )
    text_b = remove_any_tag(s)
    return len(''.join(anchor_texts)), len(text_b)
def remove_image(s, n=50):
    """Replace every ``<img ...>`` tag with a run of ``n`` letters.

    Each tag is swapped for the placeholder ``'a' * n`` rather than being
    deleted, so an image still contributes ``n`` characters to the text.
    Matching is case-insensitive and may span several lines.

    Args:
        s: HTML text.
        n: Number of ``'a'`` characters substituted per image tag.

    Returns:
        The text with each image tag replaced by the placeholder.
    """
    placeholder = 'a' * n
    return re.sub(r'''<img.*?>''', placeholder, s, flags=re.I | re.S)
def remove_video(s, n=1000):
    """Replace every ``<embed ...>`` tag with a run of ``n`` letters.

    Each tag is swapped for the placeholder ``'a' * n`` rather than being
    deleted, so an embedded object still contributes ``n`` characters to the
    text. Matching is case-insensitive and may span several lines.

    Args:
        s: HTML text.
        n: Number of ``'a'`` characters substituted per embed tag.

    Returns:
        The text with each embed tag replaced by the placeholder.
    """
    placeholder = 'a' * n
    return re.sub(r'''<embed.*?>''', placeholder, s, flags=re.I | re.S)
|