@bergus
2015-11-08T14:24:31.000000Z
字数 3825
阅读 2884
python mht 网络链接 mime
# ---------------------------------------------------------------------------
# Listing 1 -- first version (Python 2).
#
# Download a web page together with the scripts and stylesheets it references
# and bundle everything into one MHT (MIME multipart/related) message.
#
# NOTE(review): the line structure of both listings was reconstructed from the
# statement syntax because the original text had lost its newlines and
# indentation; confirm the nesting against the author's original if available.
# ---------------------------------------------------------------------------
import base64
import contextlib
import email
import email.message
import mimetypes
import os
import quopri
import sys
import urllib2
from HTMLParser import HTMLParser
from urlparse import urlparse


class MHTHTMLParser(HTMLParser):
    """Collect the URLs of external resources referenced by an HTML page.

    After ``feed()`` has been called, ``self.urls`` holds one URL string per
    ``<script src=...>`` / ``<link src=...>`` tag (except those whose URL
    contains ``google``) and per ``<link>`` tag that has an attribute value
    equal to ``stylesheet``.
    """

    def __init__(self):
        # HTMLParser is an old-style class on Python 2, so super() is not
        # usable here.
        HTMLParser.__init__(self)
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Record the resource URL of a ``link`` or ``script`` start tag."""
        if tag not in ('link', 'script'):
            return
        attrs = dict(attrs)
        if 'src' in attrs:
            src = attrs.get('src')
            # Skip empty sources and anything served from a "google" URL.
            if src and src.find('google') == -1:
                self.urls.append(src)
        elif 'stylesheet' in attrs.values():
            href = attrs.get('href')
            # A stylesheet link without href has nothing to download; queuing
            # None would make the later urlopen() call fail.
            if href:
                self.urls.append(href)


class URL2MHT(object):
    """Build an MHT archive (as an ``email.message.Message``) for one URL."""

    def __init__(self, url):
        """Remember *url* and the ``scheme://host`` prefix derived from it."""
        parts = urlparse(url)
        # Splitting the URL on its path raises ValueError when the path is
        # empty (e.g. "http://example.com"), so build the prefix from the
        # parsed components instead.
        self.domain = parts.scheme + "://" + parts.netloc
        self.url = url

    def _head(self):
        """Return the empty multipart/related container message."""
        a = email.message.Message()
        a["MIME-Version"] = "1.0"
        a["X-UnMHT-Save-State"] = "Current-State"
        a.add_header("Content-Type",
                     "multipart/related",
                     type="text/html",
                     boundary="----=_Part_7C84B8F2_5B84C39F.150DBE9AC97")
        return a

    def mht(self):
        """Fetch the page and every collected resource; return the archive.

        Network errors raised by ``urllib2`` propagate to the caller.
        """
        with contextlib.closing(urllib2.urlopen(self.url)) as response:
            content = response.read()
        pmht = MHTHTMLParser()
        pmht.feed(content)
        pmht.close()
        head = self._head()
        # The page itself is the first part, followed by its resources.
        head.attach(self._add(self.url))
        for url in pmht.urls:
            head.attach(self._add(url))
        return head

    def _add(self, url):
        """Download *url* and return it wrapped as one MIME part.

        The URL is tried as given first; if that fails it is retried with
        ``self.domain`` prepended (covers root-relative references such as
        ``/static/site.css``).  A failure of the second attempt propagates.
        """
        m = email.message.Message()
        try:
            response = urllib2.urlopen(url)
            local_url = url
        except Exception:
            # Deliberately broad best-effort fallback (relative URLs raise
            # ValueError, HTTP/network failures raise URLError, ...), but no
            # longer swallows KeyboardInterrupt / SystemExit.
            local_url = self.domain + url
            response = urllib2.urlopen(local_url)
        with contextlib.closing(response):
            content_type = response.headers.dict.get('content-type')
            content = response.read()
        # Text is stored quoted-printable; everything else as base64.
        if content_type and content_type.startswith("text/"):
            m["Content-Transfer-Encoding"] = "quoted-printable"
            m.set_payload(quopri.encodestring(content).decode("ascii"))
        else:
            m["Content-Transfer-Encoding"] = "base64"
            m.set_payload(base64.b64encode(content).decode("ascii"))
        m["Content-Location"] = local_url
        m["Content-Type"] = content_type
        return m


url = 'http://www.cnblogs.com/weixliu/p/3554868.html'
print(URL2MHT(url).mht())


# ---------------------------------------------------------------------------
# Listing 2 -- revised version (Python 2).
#
# Differences from listing 1 visible in the code: requests carry a browser
# User-Agent header, only <link> tags are collected, and the content type of
# each part comes from the tag's "type" attribute instead of the HTTP
# response.  The class names are reused, so these definitions replace the
# ones above when both listings live in the same module.
# ---------------------------------------------------------------------------
# encoding=utf-8
# (kept from the original listing; an encoding declaration only takes effect
# on the first or second line of a file)
import base64
import contextlib
import email
import email.message
import mimetypes
import os
import quopri
import sys
import urllib2
from HTMLParser import HTMLParser
from urlparse import urljoin
from urlparse import urlparse

import chardet

# Python 2 idiom: reload(sys) restores setdefaultencoding(), which site.py
# removes at start-up, so the implicit str<->unicode codec can be changed.
reload(sys)
sys.setdefaultencoding('utf-8')


class MHTHTMLParser(HTMLParser):
    """Collect ``(url, content_type)`` pairs for resources linked by a page.

    After ``feed()`` has been called, ``self.urls`` holds one tuple per
    matching ``<link>`` tag.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Record the resource referenced by a ``link`` start tag."""
        if tag not in ['link']:  # , 'script'
            return
        attrs = dict(attrs)
        src = attrs.get('src')
        if src and src.find('google') == -1:
            # A valueless "type" attribute is stored as None; fall back to
            # the default in that case too.
            self.urls.append((src, attrs.get('type') or 'text/javascript'))
        elif attrs.get('rel') == 'stylesheet':
            href = attrs.get('href')
            # Nothing to download for a stylesheet link without href.
            if href:
                self.urls.append((href, attrs.get('type') or 'text/css'))


class URL2MHT(object):
    """Build an MHT archive (as an ``email.message.Message``) for one URL."""

    def __init__(self, url):
        """Remember *url*, its ``scheme://host`` prefix and request headers."""
        uparse = urlparse(url)
        self.domain = uparse.scheme + "://" + uparse.netloc
        self.url = url
        # Sent with every request made by this object.
        self.header = {
            'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"
        }

    def _head(self):
        """Return the empty multipart/related container message."""
        a = email.message.Message()
        a["MIME-Version"] = "1.0"
        a["X-UnMHT-Save-State"] = "Current-State"
        a.add_header("Content-Type", "multipart/related", type="text/html")
        return a

    def _fetch(self, url):
        """Return the raw body of *url*, always closing the connection."""
        request = urllib2.Request(url, None, self.header)
        with contextlib.closing(urllib2.urlopen(request)) as response:
            return response.read()

    def mht(self):
        """Fetch the page and every collected resource; return the archive.

        Network errors raised by ``urllib2`` propagate to the caller.
        """
        content = self._fetch(self.url)
        pmht = MHTHTMLParser()
        pmht.feed(content)
        pmht.close()
        head = self._head()
        # The page itself is the first part, followed by its resources.
        head.attach(self._add(self.url, utype='text/html'))
        for url, utype in pmht.urls:
            head.attach(self._add(url, utype))
        return head

    def _add(self, url, utype=None):
        """Download *url* and return it as one MIME part of type *utype*.

        Relative references are resolved against the page URL.  Parts whose
        *utype* starts with ``text/`` are stored quoted-printable, all others
        (including ``utype=None``) as base64.
        """
        m = email.message.Message()
        # urljoin leaves absolute URLs untouched and yields the same result
        # as domain + url for root-relative paths, while also handling
        # document-relative ("style.css") and protocol-relative
        # ("//cdn/x.css") references that plain concatenation gets wrong.
        local_url = urljoin(self.url, url)
        content = self._fetch(local_url)
        if utype and utype.startswith("text/"):
            ecd = "quoted-printable"
            ctn = quopri.encodestring(content)
        else:
            ecd = "base64"
            ctn = base64.b64encode(content)
        m["Content-Transfer-Encoding"] = ecd
        m["Content-Location"] = local_url
        m["Content-Type"] = utype
        m.set_payload(ctn)
        return m


# url = 'http://www.cnblogs.com/weixliu/p/3554868.html'
url = 'http://blog.csdn.net/zhaoyl03/article/details/8631645'
# a = URL2MHT(url).mht().as_string(unixfrom=False)
# print a
# import codecs
# fh = codecs.open("hello.mht", mode="wb", encoding="utf-8")
# fh.write(a)
# fh.close()

# Encoding experiment on a previously saved archive: inspect the raw bytes,
# then the decoded text.
with open('hello.mht') as fh:
    x = fh.read()
print(type(x))
print(chardet.detect(x))
x = x.decode('utf-8')
print(type(x))
# NOTE(review): x is a unicode object here; chardet releases that only accept
# byte strings will raise on this call -- confirm against the installed
# version.
print(chardet.detect(x))
