www.dy2018.com
category: stores the categories
video: stores per-video information
link: stores the download links
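The code assumes a MySQL database named myresource that already contains these three tables. A minimal setup sketch, where the table and column names come from the INSERT statements below but the column types are my own guesses:

import pymysql

# Assumed schema: names are taken from the scraper's SQL, types are guesses.
db = pymysql.connect(user='root', password='199508', database='myresource')
cursor = db.cursor()
cursor.execute('create table if not exists category ('
               'id int primary key auto_increment, name varchar(64))')
cursor.execute('create table if not exists video ('
               'id int primary key auto_increment, name varchar(255), '
               'title varchar(255), cid int)')
cursor.execute('create table if not exists link ('
               'id int primary key auto_increment, link varchar(1024), vid int)')
db.commit()
db.close()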
Because the site's pages are not standardized, a lot of conditionals and regular expressions were needed to extract the correct links; a few were probably still missed, and that will have to do. Still, the page structure is fairly simple, so it makes a good practice project.
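For example, the download links on a detail page sit in <td> cells whose inline style contains word-wrap, which is what getLink below filters on. A minimal sketch of that extraction against a made-up HTML fragment:

import re
from bs4 import BeautifulSoup

# Made-up fragment imitating a detail page; real pages vary a lot.
html = ('<table><tr><td style="WORD-WRAP: break-word">ftp://example.com/movie.mkv</td>'
        '<td>unrelated cell</td></tr></table>')
soup = BeautifulSoup(html, 'lxml')
for td in soup.find_all('td'):
    style = td.get('style')
    if style is not None and 'word-wrap' in style.lower():
        print(td.text.strip())  # prints ftp://example.com/movie.mkv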
In the end it scraped 28,000+ videos and 140k+ links, running for a few dozen hours (there are sleeps along the way). All the code follows.
import time
import requests
from bs4 import BeautifulSoup
import re
import pymysql
def printTime(timeFloat, hasFinish=''):
    timeInt = round(timeFloat)
    timeHour = timeInt // (60 * 60)
    timeMinute = (timeInt - timeHour * 60 * 60) // 60
    timeSecond = timeInt - timeMinute * 60 - timeHour * 60 * 60
    if timeInt < 60:
        print('elapsed: ' + str(timeInt) + ' s', end='')
    elif timeHour < 1:
        print('elapsed: ' + str(timeMinute) + ' m ' + str(timeSecond) + ' s', end='')
    else:
        print('elapsed: ' + str(timeHour) + ' h ' + str(timeMinute) + ' m ' + str(timeSecond) + ' s', end='')
    print(' progress ' + hasFinish)
class eachVideo:
    def __init__(self):
        self.insertCategory = 'insert into category (name) values (%s)'
        self.insertVideo = 'insert into video (name,title,cid) values (%s,%s,%s)'
        self.insertLink = 'insert into link (link,vid) values (%s,%s)'
        self.selectVideo = 'select id from video where title=%s'
        self.selectCategory = 'select id from category where name=%s'
    # get a database connection
    def getDB(self):
        db = pymysql.connect(user='root', password='199508', database='myresource')
        return db
    # fetch a page and return a BeautifulSoup object
    def getSoup(self, url):
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/63.0.3239.132 Safari/537.36',
                   'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                   # 'br' dropped: requests cannot decode brotli without an extra package
                   'accept-encoding': 'gzip, deflate',
                   'cookie': '_ga=GA1.2.412668155.1542004912; gr_user_id=b54b3fbb-3005-4021-9191-49961f0925e5; '
                             '_gid=GA1.2.497245283.1542793697; Hm_lvt_a68dc87e09b2a989eec1a0669bfd59eb=1542437077,'
                             '1542793697,1542863682,1542960832; pescdfeedbackbid=2; '
                             'pescdcheckfeedbackkey=1543013870%2C51ac4fa341dda1cbfc464c9eb8b7270a'
                             '%2C7673df03288dcb33602cccfb14489466; XLA_CI=35a00c84ce21862d2edb13445a8675c8; '
                             'pescdlastsearchtime=1543041447; '
                             'gr_session_id_bce67daadd1e4d71=212d307a-bba4-44a4-88ba-da5684fa84e5; '
                             'gr_session_id_bce67daadd1e4d71_212d307a-bba4-44a4-88ba-da5684fa84e5=true; '
                             'Hm_lpvt_a68dc87e09b2a989eec1a0669bfd59eb=1543130349'}
        response = requests.get(url, headers=headers, timeout=2)
        # the site declares gb2312, but some pages only decode with the gbk superset
        try:
            html = response.content.decode('gb2312')
        except UnicodeDecodeError:
            html = response.content.decode('gbk')
        soup = BeautifulSoup(html, 'lxml')
        return soup
    # get the page title
    def getTitle(self, soup):
        title = soup.find('h1').text
        return title

    # extract the video name from the title (the part between 《 and 》)
    def getName(self, title):
        try:
            name = re.search('《(.*?)》', title).group(1)
        except AttributeError:
            # no book-title marks in the title: use the whole title
            name = title
        return name
    # download the poster image
    def getPic(self, soup, name, vid):
        img = soup.find('img')
        imageLink = img.get('src') if img is not None else None
        if imageLink is not None:
            try:
                image = requests.get(imageLink, timeout=1)
                path = 'E:\\图片\\dy2018\\' + str(vid) + '.jpg'
                with open(path, 'wb') as pic:
                    pic.write(image.content)
            except Exception:
                print(' ' + name + '---poster download failed')
        else:
            print(' ' + name + '---this video has no poster')
    # collect download links: they sit in <td> cells styled with word-wrap
    def getLink(self, soup, vid):
        link = soup.find_all('td')
        links = []
        for l in link:
            try:
                if 'word-wrap' in l.get('style').lower():
                    links.append([l.text.strip(), vid])
            except AttributeError:
                # cells without a style attribute raise here; skip them
                continue
        return links
    # insert one video record together with its links
    def execute(self, url, cid):
        db = self.getDB()
        cursor = db.cursor()
        soup = self.getSoup(url)
        title = self.getTitle(soup)
        name = self.getName(title)
        cursor.execute(self.selectVideo, title)
        titleDB = cursor.fetchone()
        if titleDB is None:
            cursor.execute(self.insertVideo, (name.strip(), title.strip(), cid))
            vid = cursor.lastrowid
            links = self.getLink(soup, vid)
            if len(links) > 0:
                cursor.executemany(self.insertLink, links)
                self.getPic(soup, name, vid)
            else:
                print(' ' + name + '---failed to get any links')
                db.rollback()
            print('--' + name + '--done')
        else:
            print('!!!!!!' + name + ' already exists!!!!!!')
        db.commit()
        db.close()
    # collect detail-page links from one page of a category
    def getEachVideoLinks(self, cateUrl):
        soup = self.getSoup(cateUrl)
        urls = soup.find_all(attrs={'class': 'ulink'})
        trueUrls = []
        for url in urls:
            trueUrl = url.get('href')
            if re.match(r'.*?\.html', trueUrl) is not None:
                trueUrls.append(trueUrl)
        return trueUrls
    # collect detail-page links from every page of a category
    def getEveryVideoLinks(self, cateUrl):
        text = self.getSoup(cateUrl).text
        # '页次' is the pager text on the site, e.g. '页次:1/93'; group(1) is the page count
        pageCount = re.search(r'页次.*?\d*?/(\d*)', text).group(1)
        pageNums = ['']
        for i in range(2, int(pageCount) + 1):
            pageNums.append('_' + str(i))
        everyTrueUrls = []
        for num in pageNums:
            url = cateUrl + '/index' + num + '.html'
            try:
                everyTrueUrls += self.getEachVideoLinks(url)
                print(url + ' page links fetched')
            except Exception:
                # retry once before giving up on this page
                try:
                    everyTrueUrls += self.getEachVideoLinks(url)
                    print(url + ' page links fetched')
                except Exception:
                    print('+++++++++++++++IMPORTANT: ' + url + ' failed+++++++++++++++')
                    continue
        return everyTrueUrls
    # build the list of category paths to crawl
    def getCategory(self):
        categorys = []
        for i in range(8, 21):
            categorys.append(str(i))
        categorys.append('html/tv/hytv')
        categorys.append('html/tv/hepai')
        categorys.append('html/tv/gangtai')
        categorys.append('html/tv/oumeitv')
        categorys.append('html/tv/rihantv')
        categorys.append('html/zongyi2013')
        categorys.append('html/2009zongyi')
        categorys.append('html/dongman')
        categorys.append('html/game')
        categorys.append('html/3gp')
        return categorys
    # crawl every page of every category and store all the links
    def getAllVideoLink(self, categorys):
        timeBegin = time.time()
        for i in range(0, len(categorys)):
            # fetch the category page
            url = 'https://www.dy2018.com/' + categorys[i]
            try:
                soup = self.getSoup(url)
            except Exception:
                try:
                    soup = self.getSoup(url)
                except Exception:
                    print('+++++++++++++++++IMPORTANT: ' + url + ' failed+++++++++++++++++++++')
                    continue
            # if i < 20:
            #     titleAll = soup.find('h1').text
            #     categoryTitle = re.search('>(.*?)>', titleAll).group(1).strip()
            # elif i < 25:
            #     titleAll = soup.find('h1').find_all('a')
            #     categoryTitle = titleAll[2].text
            # else:
            titleAll = soup.find('h1').find_all('a')
            categoryTitle = titleAll[1].text
            db = self.getDB()
            cursor = db.cursor()
            cursor.execute(self.selectCategory, categoryTitle.strip())
            ca = cursor.fetchone()
            if ca is None:
                cursor.execute(self.insertCategory, categoryTitle.strip())
                cid = cursor.lastrowid
            else:
                print(categoryTitle + ' already exists')
                # reuse the existing category id; lastrowid would be wrong here
                cid = ca[0]
            db.commit()
            db.close()
            try:
                everyUrls = self.getEveryVideoLinks(url)
            except Exception:
                try:
                    everyUrls = self.getEveryVideoLinks(url)
                except Exception:
                    print('++++++++++++++++IMPORTANT: ' + url + ' failed++++++++++++++++')
                    continue
            timeGetUrls = time.time()
            printTime(timeGetUrls - timeBegin)
            for count, everyUrl in enumerate(everyUrls):
                videoUrl = 'https://www.dy2018.com/' + everyUrl
                try:
                    self.execute(videoUrl, cid)
                except Exception:
                    # one retry, then log the error and move on
                    try:
                        self.execute(videoUrl, cid)
                    except Exception as e:
                        print(e)
                        continue
                timeFinishOne = time.time()
                hasFinish = str(count + 1) + ' / ' + str(len(everyUrls))
                printTime(timeFinishOne - timeBegin, hasFinish)
                # time.sleep(0.7)
            print('-------------------------' + categoryTitle + ' finished----------------------------')
if __name__ == '__main__':
    video = eachVideo()
    categorys = video.getCategory()
    video.getAllVideoLink(categorys)
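One design note: the listing retries every network call exactly once by nesting a second try/except inside the first. The same retry-once policy could be expressed without the duplication; a sketch, where the retry helper and its parameters are my own and not part of the original code:

import time

def retry(func, *args, attempts=2, delay=1, **kwargs):
    # call func up to `attempts` times, pausing between failures,
    # and re-raise the last exception if every attempt fails
    for i in range(attempts):
        try:
            return func(*args, **kwargs)
        except Exception:
            if i == attempts - 1:
                raise
            time.sleep(delay)

# usage: soup = retry(video.getSoup, url)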