临时保存呢

starli 发表于 2022-04-27
#爬取 起点全部作品
#爬取书籍信息，并且写入数据库
#去重判断
#1.书籍判断：通过书名和作者判断
#1.1读取数据库中的书名和作者
#1.2获取采集的书名和作者
#对两个源的数据进行判断,写入最后一次采集的数据

import requests
import re
import pymysql

# Connection settings for the MySQL database that receives the scraped rows.
DB_SETTINGS = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root',
    'password': '123456',
    'database': 'bookmanager',
}

# Single shared connection used by the rest of the script.
db = pymysql.connect(**DB_SETTINGS)

# Page that is fetched to discover links to individual book pages.
baseurl = 'https://www.qidian.com/'

# Value written to the `siteid` column of every inserted row.
siteid = 1


# Browser-like User-Agent sent with every HTTP request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0'}

# Seconds to wait on a single HTTP request before giving up.
REQUEST_TIMEOUT = 15

# Captures the numeric book ID from each book link on the listing page.
BOOK_LINK_PATTERN = r'href="//book.qidian.com/info/(\d+)/"'

# Column name -> regex whose single group captures that value on a book page.
BOOK_FIELD_PATTERNS = {
    'articlename': r'<meta property="og:title" content="([\s\S]*?)" />\s+<meta property="og:description',
    'author': r'<meta property="og:novel:author" content="([\s\S]*?)" />\s+<meta property="og:novel:book_n',
    'sort': r'<meta property="og:novel:category" content="([\s\S]*?)" />\s+<meta property="og:novel:author" cont',
    'status': r'<meta property="og:novel:status" content="([\s\S]*?)" />\s+<meta property="og:novel:author_link',
    'lastchapter': r'<meta property="og:novel:latest_chapter_name" content="([\s\S]*?)" />\s+<meta property="og:novel:latest_chapter_url',
    'jianjie': r'<meta property="og:description" content="([\s\S]*?)" />\s+<meta property="og:image"',
    'img': r' <meta property="og:image" content="([\s\S]*?)" />\s+<meta property="og:novel:category"',
    'tag': r'<a class="tags" href="//www.qidian.com/all/tag.+?/" target="_blank" data-eid="qd_G70">([\s\S]*?)</a>',
}

# Fields used for the duplicate check; a page lacking either one is unusable.
REQUIRED_FIELDS = ('articlename', 'author')


def _first_match(pattern, text):
    """Return the first captured group of `pattern` in `text`, or None."""
    found = re.search(pattern, text, re.DOTALL)
    return found.group(1) if found else None


def parse_book_page(page_text):
    """Extract the book fields from the HTML of one book page.

    Returns a dict keyed by the names in BOOK_FIELD_PATTERNS. Optional fields
    that are not present on the page are stored as ''. Returns None when a
    field listed in REQUIRED_FIELDS cannot be found.
    """
    book = {}
    for field, pattern in BOOK_FIELD_PATTERNS.items():
        value = _first_match(pattern, page_text)
        if value is None:
            if field in REQUIRED_FIELDS:
                return None
            value = ''
        book[field] = value
    return book


def fetch_page(url):
    """Return the body of `url` as text, or None on a request error or non-200 status."""
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print(error)
        return None
    if response.status_code != 200:
        return None
    return response.text


def save_book(connection, sourceid, book):
    """Insert `book` into the `article` table unless it is already stored.

    A book counts as already stored when a row with the same articlename and
    author exists. Returns True when a new row was inserted, False when the
    book already existed or a database error occurred (the error is printed
    and the transaction rolled back).
    """
    print('采集到的书名是:', book['articlename'])
    print('开始在数据库中查询书名为：《{0}》的书籍'.format(book['articlename']))
    # Parameterized statements: scraped text must never be spliced into SQL.
    lookup = 'select 1 from article where articlename = %s and author = %s limit 1'
    insert_info = 'insert into article (siteid,sourceid,articlename,author,sort,status,lastchapter,jianjie,img,tag) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    data = (
        siteid,
        sourceid,
        book['articlename'],
        book['author'],
        book['sort'],
        book['status'],
        book['lastchapter'],
        book['jianjie'],
        book['img'],
        book['tag'],
    )
    try:
        with connection.cursor() as cursor:
            cursor.execute(lookup, (book['articlename'], book['author']))
            # fetchone() returns None when no row matched.
            if cursor.fetchone() is not None:
                print("书籍已经存在，检查更新内容")
                return False
            print('书籍不存在，向数据库中添加书籍')
            print('写入书籍信息')
            cursor.execute(insert_info, data)
        connection.commit()
    except pymysql.MySQLError as error:
        print(error)
        connection.rollback()
        return False
    return True


def main():
    """Scrape every book linked from `baseurl` and store the new ones via `db`."""
    try:
        listing = fetch_page(baseurl)
        if listing is None:
            return
        # dict.fromkeys drops repeated links while keeping first-seen order,
        # so each book page is fetched only once.
        for sourceid in dict.fromkeys(re.findall(BOOK_LINK_PATTERN, listing, re.DOTALL)):
            print('源站ID:', sourceid)
            bookurl = 'https://book.qidian.com/info/' + sourceid + '/'
            print('书籍地址:', bookurl)

            page = fetch_page(bookurl)
            book = parse_book_page(page) if page is not None else None
            if book is None:
                # One bad page must not abort the whole run.
                print('采集失败，跳过该书籍:', bookurl)
                continue

            print('书名:{0}'.format(book['articlename']))
            print('作者:{0}'.format(book['author']))
            print('分类:{0}'.format(book['sort']))
            print('状态:{0}'.format(book['status']))
            print('最新章节:{0}'.format(book['lastchapter']))
            print('简介:{0}'.format(book['jianjie']))
            print('封面地址:https:{0}'.format(book['img']))

            print('============采集完成，开始写入数据=====================')
            save_book(db, sourceid, book)
            print('采集完成，进行下一本中')
    finally:
        # Close the shared connection once, after all books are processed.
        db.close()


if __name__ == '__main__':
    main()
目录