First Steps with Python | Python Project Showcase

import datetime
import re
import pymysql
import requests
from bs4 import BeautifulSoup
def spider():
    url = "https://www.bbiquge.net/"
    html = requests.get(url)
    html.encoding = 'gbk'  # the site serves GBK-encoded pages
    text = html.text
    bs = BeautifulSoup(text, 'lxml')
    box = bs.select("#mainleft .titletop")
    db = conn()
    query = db.cursor()
    for item in box:
        category = item.select('h3')[0].string
        time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        sql = 'insert into category(name,created_at) values (%s,%s)'
        query.execute(sql, (category, time))
        insert_id = db.insert_id()
        handler_top(item, insert_id, query, db)
        li = item.select("ul li")
        del li[:1]  # the first <li> is the featured book, already handled by handler_top()
        for i in li:
            book_id, link = handler_li(i, insert_id, query, db)
            handler_chapter(book_id, link, query, db)
def handler_top(content, insert_id, query, db):
    print("----------- scraping the featured (top) book --------")
    top = content.select("ul li")[0]
    title = top.select(".text strong a")
    name = title[0].string
    link = title[0]['href']
    author_str = top.select(".text p")
    category_id = insert_id
    # "作者:" is the "Author:" label on the page. The lookahead tag was lost
    # when the post was rendered; stopping at the next HTML tag ("<") is a
    # reconstruction that works for both <br> and </p>.
    pattern = re.compile("(?<=作者:).*?(?=<)")
    s = str(author_str[0])
    m = pattern.search(s)
    author = m.group()
    book_sql = 'insert into books(name,author,link,category_id) values (%s,%s,%s,%s)'
    query.execute(book_sql, (name, author, link, category_id))
    book_id = db.insert_id()
    handler_chapter(book_id, link, query, db)

def handler_li(content, insert_id, query, db):
    print("----------- scraping book entry --------")
    name = content.select("a")[0].string
    link = content.select("a")[0]['href']
    category_id = insert_id
    author = content.select("span.author")[0].string  # select() takes a CSS selector
    book_sql = 'insert into books(name,author,link,category_id) values (%s,%s,%s,%s)'
    query.execute(book_sql, (name, author, link, category_id))
    book_id = db.insert_id()
    return book_id, link

def handler_chapter(book_id, link, query, db):
    print("----------- scraping chapter list -------- " + link)
    page_html = requests.get(link)
    page_text = page_html.text
    bs = BeautifulSoup(page_text, 'lxml')
    # the chapter index is split across pages listed in a <select class="form-control">
    pages = bs.find("select", "form-control").find_all("option")
    for page in range(1, len(pages)):
        url = link + "index_" + str(page) + ".html"
        print("----------- scraping index page -------- " + url)
        chapter_html = requests.get(url)
        chapter_text = chapter_html.text
        bs = BeautifulSoup(chapter_text, 'lxml')
        dd = bs.select("dl dd")
        for d in dd:
            href = d.select("a")[0]["href"]
            url = link + href
            print("----------- scraping chapter content -------- " + url)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/112.0.0.0 Safari/537.36',
                # optional header copied from a browser session; its original
                # mixed-case value was lost when the post was rendered
                'X-Client-Data': "ckk1yqeiilbjaqiitskbcmg2yqeiqz3kaqj5k8sbcjshywei/krmaqic/swbciwgzqeivqlnaq=="
            }
            content_html = requests.get(url, headers=headers)
            content_html.encoding = 'gbk'
            content_text = content_html.text
            bs = BeautifulSoup(content_text, 'lxml')
            article = bs.find("div", id="content").text
            name = bs.find("h1").text
            page_size = page
            old_chapter = href.split(".", 1)[0]  # chapter id taken from the file name
            lk = url
            created_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            bid = book_id
            content_sql = 'insert into chapter(name,link,old_chapter,content,page,created_at,book_id)' \
                          ' values (%s,%s,%s,%s,%s,%s,%s)'
            query.execute(content_sql, (name, lk, old_chapter, article, page_size, created_at, bid))
            db.commit()
            print("----------- one chapter saved ------------")

def conn():
    try:
        db = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            passwd='root',
            db='stories',
            charset='utf8'
        )
        return db
    except Exception as b:
        print(b.args)

if __name__ == '__main__':
    try:
        spider()
    except Exception as e:
        print(e.args)
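To run the script you also need the stories database and the three tables the INSERT statements write to. A minimal setup sketch follows, assuming MySQL on localhost as in conn(); only the column names come from the queries above, while every type, length, and the auto-increment ids are my assumptions.

# Minimal schema setup sketch. Column names come from the INSERT statements
# in the scraper; all types, lengths, and the auto-increment ids are assumptions.
import pymysql

def create_tables():
    db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                         passwd='root', db='stories', charset='utf8')
    cur = db.cursor()
    cur.execute("""CREATE TABLE IF NOT EXISTS category (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(255),
        created_at DATETIME)""")
    cur.execute("""CREATE TABLE IF NOT EXISTS books (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(255),
        author VARCHAR(255),
        link VARCHAR(512),
        category_id INT)""")
    cur.execute("""CREATE TABLE IF NOT EXISTS chapter (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(255),
        link VARCHAR(512),
        old_chapter VARCHAR(64),
        content LONGTEXT,
        page INT,
        created_at DATETIME,
        book_id INT)""")
    db.commit()
    db.close()

if __name__ == '__main__':
    create_tables()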
Discussions: 1

Next step: learn to scrape with multiple threads; the step after that: build a desktop scraping tool. (A minimal sketch of the threaded approach follows below.)

1 year ago
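As a starting point for that next step, here is a minimal sketch of fetching chapter pages concurrently with the standard library. fetch_chapter() and the urls list are hypothetical stand-ins for the scraper's real pieces, and note that a single pymysql connection should not be shared across threads without a lock (or use one connection per thread).

# Minimal multithreaded fetch sketch using concurrent.futures.
# fetch_chapter() and the urls list are hypothetical stand-ins; writing the
# results to the database works as in the single-threaded version.
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests

def fetch_chapter(url):
    resp = requests.get(url, timeout=10)
    resp.encoding = 'gbk'  # the site serves GBK-encoded pages
    return url, resp.text

urls = ["https://www.bbiquge.net/some_book/1.html"]  # placeholder list

with ThreadPoolExecutor(max_workers=8) as pool:
    futures = [pool.submit(fetch_chapter, u) for u in urls]
    for future in as_completed(futures):
        url, html = future.result()
        print("fetched", url, len(html), "bytes")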
