www.dy2018.com
category: stores the categories
video: stores per-video information
link: stores the download links
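The code assumes a MySQL database named myresource that already contains these three tables. A minimal setup sketch, where the table and column names come from the INSERT statements below but the column types are my own guesses:

import pymysql

# Assumed schema: names are taken from the scraper's SQL, types are guesses.
db = pymysql.connect(user='root', password='199508', database='myresource')
cursor = db.cursor()
cursor.execute('create table if not exists category ('
               'id int primary key auto_increment, name varchar(64))')
cursor.execute('create table if not exists video ('
               'id int primary key auto_increment, name varchar(255), '
               'title varchar(255), cid int)')
cursor.execute('create table if not exists link ('
               'id int primary key auto_increment, link varchar(1024), vid int)')
db.commit()
db.close()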
Because the site's pages are not standardized, a lot of conditionals and regular expressions were needed to extract the correct links; a few were probably still missed, and that will have to do. Still, the page structure is fairly simple, so it makes a good practice project.
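For example, the download links on a detail page sit in <td> cells whose inline style contains word-wrap, which is what getLink below filters on. A minimal sketch of that extraction against a made-up HTML fragment:

import re
from bs4 import BeautifulSoup

# Made-up fragment imitating a detail page; real pages vary a lot.
html = ('<table><tr><td style="WORD-WRAP: break-word">ftp://example.com/movie.mkv</td>'
        '<td>unrelated cell</td></tr></table>')
soup = BeautifulSoup(html, 'lxml')
for td in soup.find_all('td'):
    style = td.get('style')
    if style is not None and 'word-wrap' in style.lower():
        print(td.text.strip())  # prints ftp://example.com/movie.mkv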
In the end it scraped 28,000+ videos and 140k+ links, running for a few dozen hours (there are sleeps along the way). All the code follows.
import time
import requests
from bs4 import BeautifulSoup
import re
import pymysql
def printTime(timeFloat, hasFinish=''):
    timeInt = round(timeFloat)
    timeHour = timeInt // (60 * 60)
    timeMinute = (timeInt - timeHour * 60 * 60) // 60
    timeSecond = timeInt - timeMinute * 60 - timeHour * 60 * 60
    if timeInt < 60:
        print('elapsed: ' + str(timeInt) + ' s', end='')
    elif timeHour < 1:
        print('elapsed: ' + str(timeMinute) + ' m ' + str(timeSecond) + ' s', end='')
    else:
        print('elapsed: ' + str(timeHour) + ' h ' + str(timeMinute) + ' m ' + str(timeSecond) + ' s', end='')
    print(' progress ' + hasFinish)
class eachVideo:
    def __init__(self):
        self.insertCategory = 'insert into category (name) values (%s)'
        self.insertVideo = 'insert into video (name,title,cid) values (%s,%s,%s)'
        self.insertLink = 'insert into link (link,vid) values (%s,%s)'
        self.selectVideo = 'select id from video where title=%s'
        self.selectCategory = 'select id from category where name=%s'
    # get a database connection
    def getDB(self):
        db = pymysql.connect(user='root', password='199508', database='myresource')
        return db
    # fetch a page and return a BeautifulSoup object
    def getSoup(self, url):
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/63.0.3239.132 Safari/537.36',
                   'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                   # 'br' dropped: requests cannot decode brotli without an extra package
                   'accept-encoding': 'gzip, deflate',
                   'cookie': '_ga=GA1.2.412668155.1542004912; gr_user_id=b54b3fbb-3005-4021-9191-49961f0925e5; '
                             '_gid=GA1.2.497245283.1542793697; Hm_lvt_a68dc87e09b2a989eec1a0669bfd59eb=1542437077,'
                             '1542793697,1542863682,1542960832; pescdfeedbackbid=2; '
                             'pescdcheckfeedbackkey=1543013870%2C51ac4fa341dda1cbfc464c9eb8b7270a'
                             '%2C7673df03288dcb33602cccfb14489466; XLA_CI=35a00c84ce21862d2edb13445a8675c8; '
                             'pescdlastsearchtime=1543041447; '
                             'gr_session_id_bce67daadd1e4d71=212d307a-bba4-44a4-88ba-da5684fa84e5; '
                             'gr_session_id_bce67daadd1e4d71_212d307a-bba4-44a4-88ba-da5684fa84e5=true; '
                             'Hm_lpvt_a68dc87e09b2a989eec1a0669bfd59eb=1543130349'}
        response = requests.get(url, headers=headers, timeout=2)
        # the site declares gb2312, but some pages only decode with the gbk superset
        try:
            html = response.content.decode('gb2312')
        except UnicodeDecodeError:
            html = response.content.decode('gbk')
        soup = BeautifulSoup(html, 'lxml')
        return soup
    # get the page title
    def getTitle(self, soup):
        title = soup.find('h1').text
        return title

    # extract the video name from the title (the part between 《 and 》)
    def getName(self, title):
        try:
            name = re.search('《(.*?)》', title).group(1)
        except AttributeError:
            # no book-title marks in the title: use the whole title
            name = title
        return name
    # download the poster image
    def getPic(self, soup, name, vid):
        img = soup.find('img')
        imageLink = img.get('src') if img is not None else None
        if imageLink is not None:
            try:
                image = requests.get(imageLink, timeout=1)
                path = 'E:\\图片\\dy2018\\' + str(vid) + '.jpg'
                with open(path, 'wb') as pic:
                    pic.write(image.content)
            except Exception:
                print(' ' + name + '---poster download failed')
        else:
            print(' ' + name + '---this video has no poster')
    # collect download links: they sit in <td> cells styled with word-wrap
    def getLink(self, soup, vid):
        link = soup.find_all('td')
        links = []
        for l in link:
            try:
                if 'word-wrap' in l.get('style').lower():
                    links.append([l.text.strip(), vid])
            except AttributeError:
                # cells without a style attribute raise here; skip them
                continue
        return links
    # insert one video record together with its links
    def execute(self, url, cid):
        db = self.getDB()
        cursor = db.cursor()
        soup = self.getSoup(url)
        title = self.getTitle(soup)
        name = self.getName(title)
        cursor.execute(self.selectVideo, title)
        titleDB = cursor.fetchone()
        if titleDB is None:
            cursor.execute(self.insertVideo, (name.strip(), title.strip(), cid))
            vid = cursor.lastrowid
            links = self.getLink(soup, vid)
            if len(links) > 0:
                cursor.executemany(self.insertLink, links)
                self.getPic(soup, name, vid)
            else:
                print(' ' + name + '---failed to get any links')
                db.rollback()
            print('--' + name + '--done')
        else:
            print('!!!!!!' + name + ' already exists!!!!!!')
        db.commit()
        db.close()
    # collect detail-page links from one page of a category
    def getEachVideoLinks(self, cateUrl):
        soup = self.getSoup(cateUrl)
        urls = soup.find_all(attrs={'class': 'ulink'})
        trueUrls = []
        for url in urls:
            trueUrl = url.get('href')
            if re.match(r'.*?\.html', trueUrl) is not None:
                trueUrls.append(trueUrl)
        return trueUrls
    # collect detail-page links from every page of a category
    def getEveryVideoLinks(self, cateUrl):
        text = self.getSoup(cateUrl).text
        # '页次' is the pager text on the site, e.g. '页次:1/93'; group(1) is the page count
        pageCount = re.search(r'页次.*?\d*?/(\d*)', text).group(1)
        pageNums = ['']
        for i in range(2, int(pageCount) + 1):
            pageNums.append('_' + str(i))
        everyTrueUrls = []
        for num in pageNums:
            url = cateUrl + '/index' + num + '.html'
            try:
                everyTrueUrls += self.getEachVideoLinks(url)
                print(url + ' page links fetched')
            except Exception:
                # retry once before giving up on this page
                try:
                    everyTrueUrls += self.getEachVideoLinks(url)
                    print(url + ' page links fetched')
                except Exception:
                    print('+++++++++++++++IMPORTANT: ' + url + ' failed+++++++++++++++')
                    continue
        return everyTrueUrls
    # build the list of category paths to crawl
    def getCategory(self):
        categorys = []
        for i in range(8, 21):
            categorys.append(str(i))
        categorys.append('html/tv/hytv')
        categorys.append('html/tv/hepai')
        categorys.append('html/tv/gangtai')
        categorys.append('html/tv/oumeitv')
        categorys.append('html/tv/rihantv')
        categorys.append('html/zongyi2013')
        categorys.append('html/2009zongyi')
        categorys.append('html/dongman')
        categorys.append('html/game')
        categorys.append('html/3gp')
        return categorys
    # crawl every page of every category and store all the links
    def getAllVideoLink(self, categorys):
        timeBegin = time.time()
        for i in range(0, len(categorys)):
            # fetch the category page
            url = 'https://www.dy2018.com/' + categorys[i]
            try:
                soup = self.getSoup(url)
            except Exception:
                try:
                    soup = self.getSoup(url)
                except Exception:
                    print('+++++++++++++++++IMPORTANT: ' + url + ' failed+++++++++++++++++++++')
                    continue
            # if i < 20:
            #     titleAll = soup.find('h1').text
            #     categoryTitle = re.search('>(.*?)>', titleAll).group(1).strip()
            # elif i < 25:
            #     titleAll = soup.find('h1').find_all('a')
            #     categoryTitle = titleAll[2].text
            # else:
            titleAll = soup.find('h1').find_all('a')
            categoryTitle = titleAll[1].text
            db = self.getDB()
            cursor = db.cursor()
            cursor.execute(self.selectCategory, categoryTitle.strip())
            ca = cursor.fetchone()
            if ca is None:
                cursor.execute(self.insertCategory, categoryTitle.strip())
                cid = cursor.lastrowid
            else:
                print(categoryTitle + ' already exists')
                # reuse the existing category id; lastrowid would be wrong here
                cid = ca[0]
            db.commit()
            db.close()
            try:
                everyUrls = self.getEveryVideoLinks(url)
            except Exception:
                try:
                    everyUrls = self.getEveryVideoLinks(url)
                except Exception:
                    print('++++++++++++++++IMPORTANT: ' + url + ' failed++++++++++++++++')
                    continue
            timeGetUrls = time.time()
            printTime(timeGetUrls - timeBegin)
            for count, everyUrl in enumerate(everyUrls):
                videoUrl = 'https://www.dy2018.com/' + everyUrl
                try:
                    self.execute(videoUrl, cid)
                except Exception:
                    # one retry, then log the error and move on
                    try:
                        self.execute(videoUrl, cid)
                    except Exception as e:
                        print(e)
                        continue
                timeFinishOne = time.time()
                hasFinish = str(count + 1) + ' / ' + str(len(everyUrls))
                printTime(timeFinishOne - timeBegin, hasFinish)
                # time.sleep(0.7)
            print('-------------------------' + categoryTitle + ' finished----------------------------')
if __name__ == '__main__':
    video = eachVideo()
    categorys = video.getCategory()
    video.getAllVideoLink(categorys)
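One design note: the listing retries every network call exactly once by nesting a second try/except inside the first. The same retry-once policy could be expressed without the duplication; a sketch, where the retry helper and its parameters are my own and not part of the original code:

import time

def retry(func, *args, attempts=2, delay=1, **kwargs):
    # call func up to `attempts` times, pausing between failures,
    # and re-raise the last exception if every attempt fails
    for i in range(attempts):
        try:
            return func(*args, **kwargs)
        except Exception:
            if i == attempts - 1:
                raise
            time.sleep(delay)

# usage: soup = retry(video.getSoup, url)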