爬取微信文章

2017-03-13

有时在微信公众号上看到一些写得比较好的文章，但又没有时间细看，闲下来想找这些文章的时候，又忘了是在哪个公众号看的，文章名字也想不起来。因此想搞个爬虫把想看的文章爬下来，一来可以在闲时咀嚼一下，二来也可以收藏一些好文章，做些知识积累。
只是想把自己平常做的一些东西记录下来，非教程

工具

Python 3.5.1

使用的库

re
pdfkit
requests
BeautifulSoup

功能

输入微信文章名称或者对应的文章链接，输出文章的pdf文件。

思路

如果同时提供文章链接和文章名称，则优先通过文章链接爬取，如果文章链接爬取失败，则通过文章名称爬取；
如果仅提供文章链接，则通过文章链接爬取；
如果仅提供文章名称，则通过搜狗微信接口搜索微信文章，找到对应文章链接，然后再通过文章链接爬取。

爬取流程

获取文章链接

将提供的文章名称传入搜狗微信搜索引擎搜索，将结果列表中的第一篇文章作为目标文章下载。下面代码返回目标文章链接。

def get_article_link(query):
    """Return the URL of the first Sogou WeChat search result for ``query``.

    Args:
        query: Article title (or keywords) passed to the Sogou WeChat search.

    Returns:
        The ``href`` of the first article in the result list.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        IndexError: If the result page contains no article entry.
    """
    base_url = 'http://weixin.sogou.com/weixin'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Host': 'weixin.sogou.com',
        'Connection': 'keep-alive',
    }
    params = {'type': 2, 'ie': 'utf-8', 'w': '01019900', 'sut': '707', 'query': query}
    # Without a timeout requests may block forever on a stalled connection.
    response = requests.get(base_url, headers=headers, params=params, timeout=30)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    # Only the first article of the result list is used.
    hits = soup.select('#sogou_vr_11002601_title_0')
    if not hits:
        raise IndexError('no article found for query: %r' % (query,))
    return hits[0]['href']

将文章转为html

解析文章链接，将文章内容保存为html文件。这里需要注意的是，在解析文章的时候，如果文章中包含有图片的话，正常情况下是无法下载下来的，因为爬取的文章链接为临时链接，非永久链接，无法直接解析src里面的链接。但是，data-src这个属性的值还是可以解析出来的，所以只要把data-src替换为src就可以下载图片了。

def get_article_html(link):
    """Download a WeChat article and save its body to ``wechat_article.html``.

    The element with id ``page-content`` is written to the file, encoded as
    GB18030. Lazy-loading ``data-src`` attributes are turned into ``src``
    so that the images can be fetched when the HTML is rendered.

    Args:
        link: URL of the WeChat article.

    Returns:
        The article title taken from the ``#activity-name`` element,
        stripped of surrounding whitespace.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        IndexError: If the page has no ``#activity-name`` element.
        ValueError: If the page has no ``page-content`` div.
    """
    # A different User-Agent from the search request is used, to be safe.
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11'
    # The header name is "User-Agent" (hyphen); with an underscore the value
    # is sent under an unrelated header name and the custom UA is not applied.
    article_headers = {'User-Agent': user_agent}
    response = requests.get(link, headers=article_headers, timeout=30)
    response.raise_for_status()
    # The raw bytes are handed to BeautifulSoup, which detects the encoding
    # itself, so setting response.encoding would have no effect here.
    soup = BeautifulSoup(response.content, 'html5lib')

    # The real article title takes precedence over whatever was searched for.
    title_nodes = soup.select('#activity-name')
    if not title_nodes:
        raise IndexError('article title (#activity-name) not found: %s' % link)
    article_name = title_nodes[0].text.strip()

    content = soup.find('div', {'id': 'page-content'})
    if content is None:
        # str(None) would otherwise be written out as the literal "None".
        raise ValueError('article body (#page-content) not found: %s' % link)

    # Move data-src to src on the tags themselves rather than substituting
    # text, so article text that mentions "data-src" is left untouched.
    for tag in content.find_all(attrs={'data-src': True}):
        tag['src'] = tag['data-src']
        del tag['data-src']

    # Save as HTML; html_to_pdf reads this file.
    with open('wechat_article.html', 'w', encoding='GB18030') as f:
        f.write(str(content))
    return article_name

html转pdf

html文件转pdf调用了pdfkit这个包，使用这个包需要安装wkhtmltopdf软件（pdfkit依赖于wkhtmltopdf，因此需要配置路径）。
在运行过程中，发现pdfkit在html转pdf时，生成的pdf文件名中如果包含有| / *这些特殊符号时会报错，因此如果以原文章名对pdf命名失败，则改用文章名中第一段连续的汉字、字母和数字（即第一个特殊符号之前的部分）进行命名。

def html_to_pdf(query_article, path_wk=r'D:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'):
    """Convert ``wechat_article.html`` to ``<query_article>.pdf`` with pdfkit.

    If the conversion fails (for example because the name contains
    characters such as ``|``, ``/`` or ``*``), the conversion is retried
    with a reduced name: the first run of letters, digits and Chinese
    characters found in ``query_article``. The new name is printed.

    Args:
        query_article: Desired PDF file name without the ``.pdf`` suffix.
        path_wk: Path of the wkhtmltopdf executable that pdfkit should use.

    Raises:
        Exception: Whatever pdfkit raises if the retry fails as well.
    """
    config = pdfkit.configuration(wkhtmltopdf=path_wk)
    options = {
        'page-size': 'Letter',
        'encoding': "GB18030",
        'custom-header': [
            ('Accept-Encoding', 'gzip'),
        ],
    }
    source = 'wechat_article.html'
    try:
        pdfkit.from_file(source, '%s.pdf' % query_article, configuration=config, options=options)
    except Exception:
        # "except Exception" instead of a bare except, so Ctrl-C
        # (KeyboardInterrupt) and SystemExit are no longer swallowed.
        name_compile = re.compile('[a-zA-Z\u4e00-\u9fa5][a-zA-Z0-9\u4e00-\u9fa5]+')
        runs = name_compile.findall(str(query_article))
        # A title without any usable run used to raise IndexError here;
        # fall back to a fixed name instead.
        pdf_name = runs[0] if runs else 'wechat_article'
        pdfkit.from_file(source, '%s.pdf' % pdf_name, configuration=config, options=options)
        print('文件名已被修改为:%s' % pdf_name)

源代码

最后附上文章爬取的完整代码。

#coding:utf-8
#author:linchart
import requests
from bs4 import BeautifulSoup
import re
import pdfkit
def get_article_link(query):
    """Return the URL of the first Sogou WeChat search result for ``query``.

    Args:
        query: Article title (or keywords) passed to the Sogou WeChat search.

    Returns:
        The ``href`` of the first article in the result list.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        IndexError: If the result page contains no article entry.
    """
    base_url = 'http://weixin.sogou.com/weixin'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Host': 'weixin.sogou.com',
        'Connection': 'keep-alive',
    }
    params = {'type': 2, 'ie': 'utf-8', 'w': '01019900', 'sut': '707', 'query': query}
    # Without a timeout requests may block forever on a stalled connection.
    response = requests.get(base_url, headers=headers, params=params, timeout=30)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    # Only the first article of the result list is used.
    hits = soup.select('#sogou_vr_11002601_title_0')
    if not hits:
        raise IndexError('no article found for query: %r' % (query,))
    return hits[0]['href']
def get_article_html(link):
    """Download a WeChat article and save its body to ``wechat_article.html``.

    The element with id ``page-content`` is written to the file, encoded as
    GB18030. Lazy-loading ``data-src`` attributes are turned into ``src``
    so that the images can be fetched when the HTML is rendered.

    Args:
        link: URL of the WeChat article.

    Returns:
        The article title taken from the ``#activity-name`` element,
        stripped of surrounding whitespace.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        IndexError: If the page has no ``#activity-name`` element.
        ValueError: If the page has no ``page-content`` div.
    """
    # A different User-Agent from the search request is required here.
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11'
    # The header name is "User-Agent" (hyphen); with an underscore the value
    # is sent under an unrelated header name and the custom UA is not applied.
    article_headers = {'User-Agent': user_agent}
    response = requests.get(link, headers=article_headers, timeout=30)
    response.raise_for_status()
    # The raw bytes are handed to BeautifulSoup, which detects the encoding
    # itself, so setting response.encoding would have no effect here.
    soup = BeautifulSoup(response.content, 'html5lib')

    # The real article title takes precedence over whatever was searched for.
    title_nodes = soup.select('#activity-name')
    if not title_nodes:
        raise IndexError('article title (#activity-name) not found: %s' % link)
    article_name = title_nodes[0].text.strip()

    content = soup.find('div', {'id': 'page-content'})
    if content is None:
        # str(None) would otherwise be written out as the literal "None".
        raise ValueError('article body (#page-content) not found: %s' % link)

    # Move data-src to src on the tags themselves rather than substituting
    # text, so article text that mentions "data-src" is left untouched.
    for tag in content.find_all(attrs={'data-src': True}):
        tag['src'] = tag['data-src']
        del tag['data-src']

    # Save as HTML; html_to_pdf reads this file.
    with open('wechat_article.html', 'w', encoding='GB18030') as f:
        f.write(str(content))
    return article_name
def html_to_pdf(query_article, path_wk=r'D:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'):
    """Convert ``wechat_article.html`` to ``<query_article>.pdf`` with pdfkit.

    If the conversion fails (for example because the name contains
    characters such as ``|``, ``/`` or ``*``), the conversion is retried
    with a reduced name: the first run of letters, digits and Chinese
    characters found in ``query_article``. The new name is printed.

    Args:
        query_article: Desired PDF file name without the ``.pdf`` suffix.
        path_wk: Path of the wkhtmltopdf executable that pdfkit should use.

    Raises:
        Exception: Whatever pdfkit raises if the retry fails as well.
    """
    config = pdfkit.configuration(wkhtmltopdf=path_wk)
    options = {
        'page-size': 'Letter',
        'encoding': "GB18030",
        'custom-header': [
            ('Accept-Encoding', 'gzip'),
        ],
    }
    source = 'wechat_article.html'
    try:
        pdfkit.from_file(source, '%s.pdf' % query_article, configuration=config, options=options)
    except Exception:
        # "except Exception" instead of a bare except, so Ctrl-C
        # (KeyboardInterrupt) and SystemExit are no longer swallowed.
        name_compile = re.compile('[a-zA-Z\u4e00-\u9fa5][a-zA-Z0-9\u4e00-\u9fa5]+')
        runs = name_compile.findall(str(query_article))
        # A title without any usable run used to raise IndexError here;
        # fall back to a fixed name instead.
        pdf_name = runs[0] if runs else 'wechat_article'
        pdfkit.from_file(source, '%s.pdf' % pdf_name, configuration=config, options=options)
        print('文件名已被修改为:%s' % pdf_name)
def wechat_article(query=None, link=None):
    """Download a WeChat article as a PDF, by link and/or by title.

    When ``link`` is given it is tried first and the PDF is named after the
    article's real title. If that fails and ``query`` is available, or if
    only ``query`` is given, the article is looked up through
    ``get_article_link`` and the PDF is named after ``query``.

    Args:
        query: Article title to search for. Optional if ``link`` is given.
        link: Direct URL of the article. Optional if ``query`` is given.

    Raises:
        ValueError: If neither ``query`` nor ``link`` is provided.
        Exception: Any error from the download or conversion steps that
            cannot be recovered by falling back to the search.
    """
    if not query and not link:
        raise ValueError('either query or link must be provided')

    if link:
        try:
            article_name = get_article_html(link)
            html_to_pdf(article_name)
        except Exception:
            # Without a query there is nothing to fall back to; surface the
            # real error instead of masking it with a search for None.
            if not query:
                raise
        else:
            print('文章下载成功')
            return

    # Either no link was given or downloading via the link failed.
    article_link = get_article_link(query)
    get_article_html(article_link)
    html_to_pdf(query)
    print('文章下载成功')
# A PDF may be given a Chinese name, but the name must not contain special
# characters such as * \ / |.
if __name__ == '__main__':
    # No direct link is supplied, so the article is looked up by its title.
    wechat_article(query='文本分析|词频与余弦相似度', link=None)