# Crawl the books linked from the Qidian home page and store them in MySQL.
#
# For every book link found on the home page the script downloads the book
# page, extracts the book's metadata from its <meta property="og:..."> tags
# and writes it to the `article` table.
#
# De-duplication: a book is identified by its title and author. Before
# inserting, the table is queried for a row with the same title and author;
# the book is inserted only when no such row exists.
import re

import pymysql
import requests

# Seconds to wait for an HTTP response before giving up on a request.
REQUEST_TIMEOUT = 10

siteid = 1
baseurl = 'https://www.qidian.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0'}


def _first_match(pattern, text, default=None):
    """Return the first captured group of *pattern* in *text*.

    Returns *default* instead of raising when the pattern does not match,
    so that one missing field does not abort the whole crawl.
    """
    found = re.search(pattern, text, re.DOTALL)
    return found.group(1) if found else default


def _parse_book(page_text):
    """Extract the book fields from the HTML of a book page.

    Returns a dict keyed by the `article` column names, or None when the
    title or the author cannot be found (both are needed to identify a book).
    Other missing fields are stored as empty strings.
    NOTE(review): '' is assumed to be acceptable for the optional columns;
    confirm against the table schema.
    """
    articlename = _first_match(
        r'<meta property="og:title" content="([\s\S]*?)" />\s+<meta property="og:description',
        page_text)
    author = _first_match(
        r'<meta property="og:novel:author" content="([\s\S]*?)" />\s+<meta property="og:novel:book_n',
        page_text)
    if articlename is None or author is None:
        return None

    return {
        'articlename': articlename,
        'author': author,
        # Category.
        'sort': _first_match(
            r'<meta property="og:novel:category" content="([\s\S]*?)" />\s+<meta property="og:novel:author" cont',
            page_text, ''),
        # Update status.
        'status': _first_match(
            r'<meta property="og:novel:status" content="([\s\S]*?)" />\s+<meta property="og:novel:author_link',
            page_text, ''),
        # Latest chapter name.
        'lastchapter': _first_match(
            r'<meta property="og:novel:latest_chapter_name" content="([\s\S]*?)" />\s+<meta property="og:novel:latest_chapter_url',
            page_text, ''),
        # Synopsis.
        'jianjie': _first_match(
            r'<meta property="og:description" content="([\s\S]*?)" />\s+<meta property="og:image"',
            page_text, ''),
        # Cover image address (captured without the "https:" scheme prefix).
        # NOTE(review): this pattern starts with a literal space, kept as in
        # the original; confirm it is intended.
        'img': _first_match(
            r' <meta property="og:image" content="([\s\S]*?)" />\s+<meta property="og:novel:category"',
            page_text, ''),
        # Text of the first tag link on the page.
        'tag': _first_match(
            r'<a class="tags" href="//www.qidian.com/all/tag.+?/" target="_blank" data-eid="qd_G70">([\s\S]*?)</a>',
            page_text, ''),
    }


def _save_book(connection, sourceid, book):
    """Insert *book* into `article` unless the same title and author exist.

    Both statements are parameterized: the values come from scraped HTML and
    must never be spliced into the SQL text. Database errors are printed and
    the transaction is rolled back; they do not stop the crawl.
    """
    select = 'select * from article where articlename = %s and author = %s'
    identity = (book['articlename'], book['author'])
    try:
        with connection.cursor() as cursor:
            # Look up a row with the same title and author.
            print('采集到的书名是:', book['articlename'])
            print('开始在数据库中查询书名为:《{0}》的书籍'.format(book['articlename']))
            # mogrify renders the statement with safely escaped values, so
            # the log still shows the SQL that is actually sent.
            print('执行sql语句:', cursor.mogrify(select, identity))
            cursor.execute(select, identity)
            if cursor.fetchone() is not None:
                print("书籍已经存在,检查更新内容")
                return

            print('书籍不存在,向数据库中添加书籍')
            insert_info = 'insert into article (siteid,sourceid,articlename,author,sort,status,lastchapter,jianjie,img,tag) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            data = (siteid, sourceid, book['articlename'], book['author'],
                    book['sort'], book['status'], book['lastchapter'],
                    book['jianjie'], book['img'], book['tag'])
            print('写入书籍信息')
            cursor.execute(insert_info, data)
        connection.commit()
    except pymysql.MySQLError as e:
        print(e)
        connection.rollback()


# Database connection.
# NOTE(review): the credentials are hard-coded; consider loading them from
# the environment or a configuration file.
db = pymysql.connect(
    host='127.0.0.1',
    user='root',
    password='123456',
    port=3306,
    database='bookmanager',
)

try:
    html = requests.get(baseurl, headers=headers, timeout=REQUEST_TIMEOUT)
    if html.status_code == 200:
        html_text = html.text
        # Find every book page linked from the home page.
        res = re.findall(r'href="//book.qidian.com/info/(\d+)/"', html_text, re.DOTALL)
        # dict.fromkeys drops repeated IDs while keeping first-seen order, so
        # a book linked several times is crawled only once.
        for i in dict.fromkeys(res):
            # Book ID on the source site.
            sourceid = i
            print('源站ID:', sourceid)
            # Book page address.
            bookurl = 'https://book.qidian.com/info/' + i + '/'
            print('书籍地址:', bookurl)

            try:
                bookhtml = requests.get(bookurl, headers=headers, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as e:
                # A single unreachable page must not abort the whole crawl.
                print(e)
                continue

            book = _parse_book(bookhtml.text)
            if book is None:
                print('未能解析书名或作者,跳过:', bookurl)
                continue

            print('书名:{0}'.format(book['articlename']))
            print('作者:{0}'.format(book['author']))
            print('分类:{0}'.format(book['sort']))
            print('状态:{0}'.format(book['status']))
            print('最新章节:{0}'.format(book['lastchapter']))
            print('简介:{0}'.format(book['jianjie']))
            print('封面地址:https:{0}'.format(book['img']))
            print('采集完成,进行下一本中')
            print('============采集完成,开始写入数据=====================')

            _save_book(db, sourceid, book)
    else:
        print('首页请求失败,状态码:', html.status_code)
finally:
    # Close the connection once, after every book has been processed.
    db.close()