#爬取 起点全部作品
#爬取书籍信息,并且写入数据库
#去重判断
#1.书籍判断:通过书名和作者判断
#1.1读取数据库中的书名和作者
#1.2获取采集的书名和作者
#对两个源的数据进行判断,写入最后一次采集的数据
import requests
import re
import pymysql
# Database connection shared by the whole run; every book is checked against
# and written to the `article` table through it.
db = pymysql.connect(
    host='127.0.0.1',
    user='root',
    password='123456',
    port=3306,
    database='bookmanager',
    # The scraped titles, authors and synopses are Chinese text. Pin the
    # connection character set explicitly so what gets stored does not depend
    # on the driver's or the server's default encoding.
    charset='utf8mb4',
)
# Value written to the `siteid` column of every inserted row.
siteid = 1
# Page whose book links are used as the list of books to scrape.
baseurl = 'https://www.qidian.com/'
# Browser-like User-Agent sent with every HTTP request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0',
}
# Seconds to wait for any single HTTP response before giving up on it.
REQUEST_TIMEOUT = 10

# Captures the numeric book id from each book-detail link on the home page.
BOOK_LINK_PATTERN = r'href="//book\.qidian\.com/info/(\d+)/"'

# Column name -> regex whose single group captures that value from a
# book-detail page. Every Open Graph pattern is anchored on the <meta> tag
# that follows it, so it only matches while the page keeps that tag order.
BOOK_FIELD_PATTERNS = {
    'articlename': r'<meta property="og:title" content="([\s\S]*?)" />\s+<meta property="og:description',
    'author': r'<meta property="og:novel:author" content="([\s\S]*?)" />\s+<meta property="og:novel:book_n',
    'sort': r'<meta property="og:novel:category" content="([\s\S]*?)" />\s+<meta property="og:novel:author" cont',
    'status': r'<meta property="og:novel:status" content="([\s\S]*?)" />\s+<meta property="og:novel:author_link',
    'lastchapter': r'<meta property="og:novel:latest_chapter_name" content="([\s\S]*?)" />\s+<meta property="og:novel:latest_chapter_url',
    'jianjie': r'<meta property="og:description" content="([\s\S]*?)" />\s+<meta property="og:image"',
    'img': r' <meta property="og:image" content="([\s\S]*?)" />\s+<meta property="og:novel:category"',
    'tag': r'<a class="tags" href="//www.qidian.com/all/tag.+?/" target="_blank" data-eid="qd_G70">([\s\S]*?)</a>',
}


def extract_book_ids(home_html):
    """Return the book ids linked from the home page HTML.

    Ids are returned as strings, in first-seen order, with repeats removed so
    a book that is linked several times is only fetched once.
    """
    return list(dict.fromkeys(re.findall(BOOK_LINK_PATTERN, home_html)))


def parse_book_page(page_html):
    """Extract the book metadata fields from a book-detail page.

    Returns a dict with one entry per key of BOOK_FIELD_PATTERNS. A field
    whose pattern does not match is returned as '' instead of raising, so a
    page without tags (or with a changed layout) cannot abort the crawl.
    """
    book = {}
    for field, pattern in BOOK_FIELD_PATTERNS.items():
        found = re.search(pattern, page_html, re.DOTALL)
        book[field] = found.group(1) if found else ''
    return book


def save_book(connection, book, site_id, source_id):
    """Insert `book` into the `article` table unless it is already stored.

    A book counts as already stored when a row with the same articlename and
    author exists. Both statements pass their values as query parameters, so
    quotes in scraped text can neither break nor alter the SQL.

    Returns True when a new row was inserted; False when the book already
    existed or a database error occurred (the error is printed and the
    transaction rolled back).
    """
    select_sql = 'select * from article where articlename = %s and author = %s'
    insert_sql = ('insert into article '
                  '(siteid,sourceid,articlename,author,sort,status,lastchapter,jianjie,img,tag) '
                  'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
    identity = (book['articlename'], book['author'])
    inserted = False
    try:
        with connection.cursor() as cursor:
            print('采集到的书名是:', book['articlename'])
            print('开始在数据库中查询书名为:《{0}》的书籍'.format(book['articlename']))
            # mogrify renders the statement exactly as execute() will send it.
            print('执行sql语句:', cursor.mogrify(select_sql, identity))
            cursor.execute(select_sql, identity)
            # fetchone() returns None when no row matched.
            if cursor.fetchone() is None:
                print('书籍不存在,向数据库中添加书籍')
                print('写入书籍信息')
                cursor.execute(insert_sql, (
                    site_id, source_id, book['articlename'], book['author'],
                    book['sort'], book['status'], book['lastchapter'],
                    book['jianjie'], book['img'], book['tag'],
                ))
                inserted = True
            else:
                # Only reported: refreshing an existing row is not implemented.
                print("书籍已经存在,检查更新内容")
        connection.commit()
    except pymysql.MySQLError as error:
        print(error)
        connection.rollback()
        return False
    return inserted


def crawl():
    """Scrape every book linked from the home page and store the new ones.

    Uses the module-level `baseurl`, `headers`, `siteid` and `db`. A book
    whose page cannot be fetched, or whose title or author cannot be parsed,
    is skipped so one bad page does not stop the run. The connection is
    closed exactly once, after the last book, even if the crawl fails.
    """
    try:
        home = requests.get(baseurl, headers=headers, timeout=REQUEST_TIMEOUT)
        if home.status_code != 200:
            print('首页请求失败:', home.status_code)
            return
        for sourceid in extract_book_ids(home.text):
            print('源站ID:', sourceid)
            bookurl = 'https://book.qidian.com/info/' + sourceid + '/'
            print('书籍地址:', bookurl)
            try:
                page = requests.get(bookurl, headers=headers, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as error:
                print(error)
                continue
            if page.status_code != 200:
                print('页面请求失败,跳过:', bookurl, page.status_code)
                continue
            book = parse_book_page(page.text)
            # Title and author are the duplicate-check key; without them the
            # book can neither be looked up nor stored meaningfully.
            if not book['articlename'] or not book['author']:
                print('未能解析书名或作者,跳过:', bookurl)
                continue
            print('书名:{0}'.format(book['articlename']))
            print('作者:{0}'.format(book['author']))
            print('分类:{0}'.format(book['sort']))
            print('状态:{0}'.format(book['status']))
            print('最新章节:{0}'.format(book['lastchapter']))
            print('简介:{0}'.format(book['jianjie']))
            print('封面地址:https:{0}'.format(book['img']))
            print('采集完成,进行下一本中')
            print('============采集完成,开始写入数据=====================')
            save_book(db, book, siteid, sourceid)
    finally:
        db.close()


if __name__ == '__main__':
    crawl()