openihu
/
从零学Python
mirror of https://gitee.com/52itstyle/Python.git

 
			
							# -*- coding: utf-8 -*-
import codecs
import sys
import tomd
import random
import requests
from bs4 import BeautifulSoup
import mysql_DBUtils
from mysql_DBUtils import MyPymysqlPool


blog_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0",
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
]


# 定义DB
mysql = MyPymysqlPool("dbMysql")


def reptile():
    page = 2
    for n in range(1, page):
        url = "https://www.cnblogs.com/yjmyzz/default.html?page=" + str(n)
        headers = {'User-Agent': random.choice(blog_headers)}
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        count = soup.find_all(class_='postTitle')
        for c in count:
            try:
                # 获取博客地址
                href = c.find('a').attrs['href']
                headers = {'User-Agent': random.choice(blog_headers)}
                res = requests.get(href, headers=headers)
                soup = BeautifulSoup(res.text, 'html.parser')
                # 获取博客内容
                content = soup.find('div', class_='blogpost-body')
                # 去掉博客外层的DIV
                content = content.decode_contents(formatter="html")
                # 获取博客标题
                title = soup.find('a', id='cb_post_title_url').text
                # 博客HTML转MD
                content = tomd.Tomd(content).markdown
                # 博客写入数据库
                write_db(title, content, href)
                print("已插入:{}".format(href))
            except Exception as e:
                print(e)


# 写入数据库
def write_db(title, content, url):
    sql = "INSERT INTO blog (title, content,url) VALUES(%(title)s, %(content)s, %(url)s);"
    param = {"title": title, "content": content, "url": url}
    mysql.insert(sql, param)


if __name__ == "__main__":
    reptile()