@bergus
2015-11-08T22:24:31.000000Z
字数 3825
阅读 2726
python
mht
网络链接
mime
import base64
import email
import email.message
import mimetypes
import os
import quopri
import sys
import urllib2
from HTMLParser import HTMLParser
from urlparse import urlparse
class MHTHTMLParser(HTMLParser):
    """Collect the URLs of external resources referenced by an HTML page.

    Feed the page source with ``feed()``; afterwards ``urls`` holds, in
    document order, one string per collected resource:

    * the ``src`` of every ``<link>``/``<script>`` tag, unless that URL
      contains the substring ``google``;
    * the ``href`` of every ``<link>``/``<script>`` tag that has no ``src``
      attribute but has some attribute whose value is ``stylesheet``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # URLs found so far, exactly as written in the page (may be relative).
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Record the resource URL of a ``<link>`` or ``<script>`` start tag.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``; a valueless attribute arrives with value ``None``.
        """
        if tag not in ('link', 'script'):
            return
        attrs = dict(attrs)
        if 'src' in attrs:
            src = attrs['src']
            # Skip empty/valueless src and anything pointing at google.
            if src and 'google' not in src:
                self.urls.append(src)
        elif 'stylesheet' in attrs.values():
            href = attrs.get('href')
            # A stylesheet link without an href used to append None, which
            # then crashed the downloader; only record real URLs.
            if href:
                self.urls.append(href)
class URL2MHT(object):
    """Download a web page and its scripts/stylesheets into one MHT message.

    ``mht()`` returns an ``email.message.Message`` of type
    ``multipart/related`` whose first part is the page itself and whose
    remaining parts are the resources found by ``MHTHTMLParser``.

    Attributes:
        url: the page URL passed to the constructor.
        domain: ``scheme://netloc`` of ``url``.
    """

    def __init__(self, url):
        parts = urlparse(url)
        # Build the origin from the parsed pieces.  Splitting the URL on its
        # path raised ValueError for an empty path and produced a wrong
        # result whenever the path text also occurred earlier in the URL.
        self.domain = parts.scheme + "://" + parts.netloc
        self.url = url

    def _head(self):
        """Return the empty multipart/related envelope message."""
        a = email.message.Message()
        a["MIME-Version"] = "1.0"
        a["X-UnMHT-Save-State"] = "Current-State"
        a.add_header("Content-Type",
                     "multipart/related",
                     type="text/html",
                     boundary="----=_Part_7C84B8F2_5B84C39F.150DBE9AC97")
        return a

    def mht(self):
        """Fetch the page and every collected resource; return the archive.

        Raises whatever ``urllib2.urlopen`` raises if a download fails.
        """
        response = urllib2.urlopen(self.url)
        try:
            content = response.read()
        finally:
            response.close()
        parser = MHTHTMLParser()
        parser.feed(content)
        parser.close()
        head = self._head()
        # The page itself is always the first part of the archive.
        head.attach(self._add(self.url))
        for url in parser.urls:
            head.attach(self._add(url))
        return head

    def _add(self, url):
        """Download ``url`` and return it as one MIME part.

        ``url`` may be absolute or relative to the page URL.  Responses
        whose Content-Type starts with ``text/`` are stored as
        quoted-printable, everything else as base64.
        """
        # Local import: the file only imports ``urlparse`` from this module.
        from urlparse import urljoin

        # Resolve against the page URL instead of "try it, and on any error
        # glue the domain in front": that relied on a bare ``except`` and
        # built a wrong URL for links like "css/a.css" or "//cdn/x.js".
        local_url = urljoin(self.url, url)
        response = urllib2.urlopen(local_url)
        try:
            content_type = response.headers.dict.get('content-type')
            content = response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()
        m = email.message.Message()
        if content_type and content_type.startswith("text/"):
            m["Content-Transfer-Encoding"] = "quoted-printable"
            m.set_payload(quopri.encodestring(content).decode("ascii"))
        else:
            m["Content-Transfer-Encoding"] = "base64"
            m.set_payload(base64.b64encode(content).decode("ascii"))
        m["Content-Location"] = local_url
        # Never emit a None header value when the server sent no type.
        m["Content-Type"] = content_type or "application/octet-stream"
        return m
# Demo run: archive one sample blog post and dump the resulting MHT message
# to standard output.
url = 'http://www.cnblogs.com/weixliu/p/3554868.html'
print(URL2MHT(url).mht())
# encoding=utf-8
import base64
import email
import email.message
import mimetypes
import os
import quopri
import sys
import urllib2
from HTMLParser import HTMLParser
from urlparse import urlparse
import chardet
# Reload the sys module, then set the interpreter-wide default string
# encoding to UTF-8.
# NOTE(review): presumably the reload is there because setdefaultencoding is
# not available on the already-imported sys module, and the UTF-8 default is
# meant to avoid UnicodeDecodeError on non-ASCII page content — confirm.
reload(sys)
sys.setdefaultencoding('utf-8')
class MHTHTMLParser(HTMLParser):
    """Collect external resources referenced by ``<link>`` tags.

    Feed the page source with ``feed()``; afterwards ``urls`` holds, in
    document order, one ``(url, content_type)`` tuple per resource:

    * a ``<link>`` with a ``src`` not containing ``google`` yields
      ``(src, type)`` with ``type`` defaulting to ``text/javascript``;
    * otherwise a ``<link rel="stylesheet">`` yields ``(href, type)`` with
      ``type`` defaulting to ``text/css``.

    ``<script>`` tags are not collected.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # (url, content_type) pairs; URLs are kept as written in the page.
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Record the resource referenced by a ``<link>`` start tag.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``; a valueless attribute arrives with value ``None``.
        """
        if tag != 'link':  # 'script' is deliberately left out
            return
        attrs = dict(attrs)
        src = attrs.get('src')
        if src and 'google' not in src:
            self.urls.append((src, attrs.get('type', 'text/javascript')))
        elif attrs.get('rel') == 'stylesheet':
            href = attrs.get('href')
            # A stylesheet link without an href used to be recorded as
            # (None, 'text/css') and crashed the downloader; skip it.
            if href:
                self.urls.append((href, attrs.get('type', 'text/css')))
class URL2MHT(object):
    """Download a web page and its linked resources into one MHT message.

    ``mht()`` returns an ``email.message.Message`` of type
    ``multipart/related`` whose first part is the page itself and whose
    remaining parts are the resources found by ``MHTHTMLParser``.

    Attributes:
        url: the page URL passed to the constructor.
        domain: ``scheme://netloc`` of ``url``.
        header: HTTP request headers sent with every download.
    """

    def __init__(self, url):
        parts = urlparse(url)
        self.domain = parts.scheme + "://" + parts.netloc
        self.url = url
        # Sent with every request so the downloads carry a browser
        # User-Agent string.
        self.header = {
            'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}

    def _head(self):
        """Return the empty multipart/related envelope message."""
        a = email.message.Message()
        a["MIME-Version"] = "1.0"
        a["X-UnMHT-Save-State"] = "Current-State"
        a.add_header("Content-Type",
                     "multipart/related",
                     type="text/html")
        return a

    def _fetch(self, url):
        """Return the body of ``url``, requested with ``self.header``."""
        response = urllib2.urlopen(urllib2.Request(url, None, self.header))
        try:
            return response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()

    def mht(self):
        """Fetch the page and every collected resource; return the archive.

        Raises whatever ``urllib2.urlopen`` raises if a download fails.
        """
        parser = MHTHTMLParser()
        parser.feed(self._fetch(self.url))
        parser.close()
        head = self._head()
        # The page itself is always the first part of the archive.
        head.attach(self._add(self.url, utype='text/html'))
        for url, utype in parser.urls:
            head.attach(self._add(url, utype))
        return head

    def _add(self, url, utype=None):
        """Download ``url`` and return it as one MIME part.

        ``url`` may be absolute or relative to the page URL.  ``utype`` is
        the Content-Type to declare; ``text/*`` types are stored as
        quoted-printable, everything else as base64.  When ``utype`` is
        omitted the part is declared ``application/octet-stream``.
        """
        # Local import: the file only imports ``urlparse`` from this module.
        from urlparse import urljoin

        # Resolve against the page URL.  Prefixing the domain only worked
        # for root-relative links; "css/a.css" lost its slash and
        # "//cdn/x.css" was passed on without a scheme.
        local_url = urljoin(self.url, url)
        content = self._fetch(local_url)
        if not utype:
            # Never emit a None Content-Type header.
            utype = "application/octet-stream"
        if utype.startswith("text/"):
            encoding = "quoted-printable"
            payload = quopri.encodestring(content)
        else:
            encoding = "base64"
            payload = base64.b64encode(content)
        part = email.message.Message()
        part["Content-Transfer-Encoding"] = encoding
        part["Content-Location"] = local_url
        part["Content-Type"] = utype
        part.set_payload(payload)
        return part
# url = 'http://www.cnblogs.com/weixliu/p/3554868.html'
url = 'http://blog.csdn.net/zhaoyl03/article/details/8631645'
# Disabled step that produced hello.mht: build the archive for ``url`` and
# write it out as UTF-8.
# a = URL2MHT(url).mht().as_string(unixfrom=False)
# print a
# import codecs
# fh = codecs.open("hello.mht", mode="wb", encoding="utf-8")
# fh.write(a)
# fh.close()

# Inspect the previously saved archive: show its type and detected encoding
# before and after decoding it as UTF-8.
# The ``with`` block closes the file; the handle was previously left open.
with open('hello.mht') as fh:
    x = fh.read()
print(type(x))
print(chardet.detect(x))
x = x.decode('utf-8')
print(type(x))
# NOTE(review): x is a unicode object at this point; some chardet releases
# only accept byte strings — confirm this call works with the installed one.
print(chardet.detect(x))