#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# 偷乐短剧 (toule.top) short-drama spider
import sys
import json
import re
import time
import urllib.parse

import requests
from bs4 import BeautifulSoup

# verify=False is passed to requests below; silence the InsecureRequestWarning
requests.packages.urllib3.disable_warnings()

# Import the base class shipped with the TVBox runtime
sys.path.append('../../')
try:
    from base.spider import Spider
except ImportError:
    # Minimal stand-in so the module can be imported for local debugging
    class Spider:
        def init(self, extend=""):
            pass


class Spider(Spider):
    def __init__(self):
        # Site root URL
        self.siteUrl = "https://www.toule.top"
        # On the live site, category pages follow the pattern:
        #   /index.php/vod/show/class/<category name>/id/1.html
        # Category map - names and paths taken from the site itself
        self.cateManual = {
            "男频": "/index.php/vod/show/class/%E7%94%B7%E9%A2%91/id/1.html",
            "女频": "/index.php/vod/show/class/%E5%A5%B3%E9%A2%91/id/1.html",
            "都市": "/index.php/vod/show/class/%E9%83%BD%E5%B8%82/id/1.html",
            "赘婿": "/index.php/vod/show/class/%E8%B5%98%E5%A9%BF/id/1.html",
            "战神": "/index.php/vod/show/class/%E6%88%98%E7%A5%9E/id/1.html",
            "古代言情": "/index.php/vod/show/class/%E5%8F%A4%E4%BB%A3%E8%A8%80%E6%83%85/id/1.html",
            "现代言情": "/index.php/vod/show/class/%E7%8E%B0%E4%BB%A3%E8%A8%80%E6%83%85/id/1.html",
            "历史": "/index.php/vod/show/class/%E5%8E%86%E5%8F%B2/id/1.html",
            "玄幻": "/index.php/vod/show/class/%E7%8E%84%E5%B9%BB/id/1.html",
            "搞笑": "/index.php/vod/show/class/%E6%90%9E%E7%AC%91/id/1.html",
            "甜宠": "/index.php/vod/show/class/%E7%94%9C%E5%AE%A0/id/1.html",
            "励志": "/index.php/vod/show/class/%E5%8A%B1%E5%BF%97/id/1.html",
            "逆袭": "/index.php/vod/show/class/%E9%80%86%E8%A2%AD/id/1.html",
            "穿越": "/index.php/vod/show/class/%E7%A9%BF%E8%B6%8A/id/1.html",
            "古装": "/index.php/vod/show/class/%E5%8F%A4%E8%A3%85/id/1.html"
        }
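        # Each percent-encoded segment above is just the UTF-8 URL quoting of
        # the category name, e.g. urllib.parse.quote("男频") == "%E7%94%B7%E9%A2%91",
        # so further categories could be added programmatically if needed.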
        # Default request headers
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            "Referer": "https://www.toule.top/",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        }
        # Simple in-memory cache: values plus per-key expiry timestamps
        self.cache = {}
        self.cache_timeout = {}
    def getName(self):
        return "偷乐短剧"

    def init(self, extend=""):
        # Nothing to initialize beyond the constructor
        return

    def isVideoFormat(self, url):
        """Return True if the URL looks like a direct video resource."""
        video_formats = ['.mp4', '.m3u8', '.ts', '.flv', '.avi', '.mkv', '.mov', '.rmvb', '.3gp']
        for fmt in video_formats:
            if fmt in url.lower():
                return True
        return False

    def manualVideoCheck(self):
        """Whether the stream needs to be sniffed manually."""
        return False
    # Utility - unified network request
    def fetch(self, url, headers=None, data=None, method="GET"):
        """Unified HTTP helper; returns a Response, or None on any failure."""
        try:
            if headers is None:
                headers = self.headers.copy()
            if method.upper() == "GET":
                response = requests.get(url, headers=headers, params=data, timeout=10, verify=False)
            else:  # POST
                response = requests.post(url, headers=headers, data=data, timeout=10, verify=False)
            response.raise_for_status()
            response.encoding = response.apparent_encoding or 'utf-8'
            return response
        except Exception as e:
            self.log(f"Request failed: {url}, error: {str(e)}", "ERROR")
            return None
    # Cache helpers
    def getCache(self, key, timeout=3600):
        """Return the cached value for `key`, or None if missing or expired."""
        if key in self.cache and key in self.cache_timeout:
            if time.time() < self.cache_timeout[key]:
                return self.cache[key]
            else:
                del self.cache[key]
                del self.cache_timeout[key]
        return None

    def setCache(self, key, value, timeout=3600):
        """Store `value` under `key` with an expiry of `timeout` seconds."""
        self.cache[key] = value
        self.cache_timeout[key] = time.time() + timeout
    # Logging helper
    def log(self, msg, level='INFO'):
        """Print a timestamped log line at or above the current level."""
        levels = {
            'DEBUG': 0,
            'INFO': 1,
            'WARNING': 2,
            'ERROR': 3
        }
        current_level = 'INFO'  # set to 'DEBUG' for more verbose output
        if levels.get(level, 4) >= levels.get(current_level, 1):
            print(f"[{level}] {time.strftime('%Y-%m-%d %H:%M:%S')} - {msg}")
    # Helper - extract the video ID from a URL
    def extractVodId(self, url):
        """Extract the numeric video ID from a play or detail URL."""
        # Play path format: /index.php/vod/play/id/9024/sid/1/nid/1.html
        # The trailing slash is left optional so detail-style URLs match too.
        match = re.search(r'/id/(\d+)', url)
        if match:
            return match.group(1)
        return ""
    # Helper - extract category tags from page text
    def extractCategories(self, text):
        """Split a tag string such as "男频,逆袭,亲情,短剧" into a list."""
        cats = []
        if "," in text:
            parts = text.split(",")
            for part in parts:
                part = part.strip()
                # "短剧" appears on every item, so drop it as noise
                if part and part != "短剧":
                    cats.append(part)
        return cats
    # Main interface implementations
    def homeContent(self, filter):
        """Return the home page categories and recommended videos."""
        result = {}
        classes = []
        # Try the cache first
        cache_key = 'home_classes'
        cached_classes = self.getCache(cache_key)
        if cached_classes:
            classes = cached_classes
        else:
            # Fall back to the predefined category map
            for k, v in self.cateManual.items():
                classes.append({
                    'type_id': v,  # the full URL path doubles as the type_id
                    'type_name': k
                })
            # Cache for 24 hours
            self.setCache(cache_key, classes, 24 * 3600)
        result['class'] = classes
        # Home page recommendations
        videos = self.homeVideoContent().get('list', [])
        result['list'] = videos
        return result
    def homeVideoContent(self):
        """Return the "最新更新" (latest updates) list from the home page."""
        result = {'list': []}
        videos = []
        # Try the cache first
        cache_key = 'home_videos'
        cached_videos = self.getCache(cache_key)
        if cached_videos:
            return {'list': cached_videos}
        try:
            response = self.fetch(self.siteUrl)
            if response and response.status_code == 200:
                html = response.text
                soup = BeautifulSoup(html, 'html.parser')
                # Locate the "latest updates" section heading
                latest_section = soup.find('h2', string=lambda t: t and '最新更新' in t)
                if latest_section:
                    container = latest_section.parent  # the section container
                    if container:
                        # Each video is an <li class="item">
                        items = container.find_all('li', class_='item')
                        for item in items:
                            try:
                                # Title
                                title_link = item.find('h3')
                                if not title_link:
                                    continue
                                title = title_link.text.strip()
                                # The first link is the detail page
                                link_tag = item.find('a')
                                if not link_tag:
                                    continue
                                link = link_tag.get('href', '')
                                if not link.startswith('http'):
                                    link = urllib.parse.urljoin(self.siteUrl, link)
                                # Video ID
                                vid = self.extractVodId(link)
                                if not vid:
                                    continue
                                # Poster image
                                img_tag = item.find('img')
                                img_url = ""
                                if img_tag:
                                    img_url = img_tag.get('src', img_tag.get('data-src', ''))
                                    if img_url and not img_url.startswith('http'):
                                        img_url = urllib.parse.urljoin(self.siteUrl, img_url)
                                # Remarks (update status)
                                remarks = ""
                                remarks_tag = item.find('span', class_='remarks')
                                if remarks_tag:
                                    remarks = remarks_tag.text.strip()
                                # Tags
                                tags = ""
                                tags_tag = item.find('span', class_='tags')
                                if tags_tag:
                                    tags = tags_tag.text.strip()
                                # Merge remarks and tags
                                if remarks and tags:
                                    remarks = f"{remarks} | {tags}"
                                elif tags:
                                    remarks = tags
                                # Build the video entry
                                videos.append({
                                    'vod_id': vid,
                                    'vod_name': title,
                                    'vod_pic': img_url,
                                    'vod_remarks': remarks
                                })
                            except Exception as e:
                                self.log(f"Error while parsing a video item: {str(e)}", "ERROR")
                                continue
                # Cache for 1 hour
                self.setCache(cache_key, videos, 3600)
        except Exception as e:
            self.log(f"Error while fetching home page videos: {str(e)}", "ERROR")
        result['list'] = videos
        return result
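    # Design note: the type_id values emitted by homeContent() are full URL
    # paths, so categoryContent() can build page URLs by string substitution
    # alone. Pagination appears to follow the MacCMS-style convention
    # "xxx.html" -> "xxx/page/<pg>.html", as implemented below.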
    def categoryContent(self, tid, pg, filter, extend):
        """Return the video list for one category page."""
        result = {}
        videos = []
        # Normalize the page number
        if pg is None:
            pg = 1
        else:
            pg = int(pg)
        # Build the category URL - tid is a full URL path
        if tid.startswith("/"):
            # Paged URLs look like: /index.php/vod/show/class/男频/id/1/page/2.html
            if pg > 1:
                if "html" in tid:
                    category_url = tid.replace(".html", f"/page/{pg}.html")
                else:
                    category_url = f"{tid}/page/{pg}.html"
            else:
                category_url = tid
            full_url = urllib.parse.urljoin(self.siteUrl, category_url)
        else:
            # tid is not a URL path; it may be a legacy category name, so look it up
            category_url = self.cateManual.get(tid, "")
            if not category_url:
                self.log(f"No URL found for category ID: {tid}", "ERROR")
                result['list'] = []
                result['page'] = pg
                result['pagecount'] = 1
                result['limit'] = 0
                result['total'] = 0
                return result
            # Apply the page number
            if pg > 1:
                if "html" in category_url:
                    category_url = category_url.replace(".html", f"/page/{pg}.html")
                else:
                    category_url = f"{category_url}/page/{pg}.html"
            full_url = urllib.parse.urljoin(self.siteUrl, category_url)
        # Defaults, also returned when the request fails
        total = 0
        pagecount = 1
        limit = 20
        # Fetch the category page
        try:
            response = self.fetch(full_url)
            if response and response.status_code == 200:
                html = response.text
                soup = BeautifulSoup(html, 'html.parser')
                # Video items (selector matches the site's actual HTML structure)
                items = soup.find_all('li', class_='item')
                for item in items:
                    try:
                        # Title
                        title_tag = item.find('h3')
                        if not title_tag:
                            continue
                        title = title_tag.text.strip()
                        # Detail link
                        link_tag = item.find('a')
                        if not link_tag:
                            continue
                        link = link_tag.get('href', '')
                        if not link.startswith('http'):
                            link = urllib.parse.urljoin(self.siteUrl, link)
                        # Video ID
                        vid = self.extractVodId(link)
                        if not vid:
                            continue
                        # Poster image
                        img_tag = item.find('img')
                        img_url = ""
                        if img_tag:
                            img_url = img_tag.get('src', img_tag.get('data-src', ''))
                            if img_url and not img_url.startswith('http'):
                                img_url = urllib.parse.urljoin(self.siteUrl, img_url)
                        # Remarks (update status)
                        remarks = ""
                        remarks_tag = item.find('span', class_='remarks')
                        if remarks_tag:
                            remarks = remarks_tag.text.strip()
                        # Tags
                        tags = ""
                        tags_tag = item.find('span', class_='tags')
                        if tags_tag:
                            tags = tags_tag.text.strip()
                        # Merge remarks and tags
                        if remarks and tags:
                            remarks = f"{remarks} | {tags}"
                        elif tags:
                            remarks = tags
                        # Build the video entry
                        videos.append({
                            'vod_id': vid,
                            'vod_name': title,
                            'vod_pic': img_url,
                            'vod_remarks': remarks
                        })
                    except Exception as e:
                        self.log(f"Error while parsing a category item: {str(e)}", "ERROR")
                        continue
                # Pagination info
                total = len(videos)
                # Look for the pager and take the largest numeric link as the page count
                pagination = soup.find('ul', class_='page')
                if pagination:
                    last_page_links = pagination.find_all('a')
                    for link in last_page_links:
                        page_text = link.text.strip()
                        if page_text.isdigit():
                            pagecount = max(pagecount, int(page_text))
        except Exception as e:
            self.log(f"Error while fetching category content: {str(e)}", "ERROR")
        result['list'] = videos
        result['page'] = pg
        result['pagecount'] = pagecount
        result['limit'] = limit
        result['total'] = total
        return result
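    # Design note: detailContent() scrapes the *play* page
    # (/index.php/vod/play/id/<vid>/sid/1/nid/1.html) rather than a separate
    # detail page, since that page appears to carry the title, poster,
    # synopsis, and the full episode carousel in one place.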
    def detailContent(self, ids):
        """Return the detail record (including the episode list) for one video."""
        result = {}
        if not ids or len(ids) == 0:
            return result
        # Video ID
        vid = ids[0]
        # Build the play page URL
        play_url = f"{self.siteUrl}/index.php/vod/play/id/{vid}/sid/1/nid/1.html"
        try:
            response = self.fetch(play_url)
            if not response or response.status_code != 200:
                return result
            html = response.text
            soup = BeautifulSoup(html, 'html.parser')
            # Basic information
            # Title
            title = ""
            title_tag = soup.find('h1', class_='items-title')
            if title_tag:
                title = title_tag.text.strip()
            # Poster
            pic = ""
            pic_tag = soup.find('img', class_='thumb')
            if pic_tag:
                pic = pic_tag.get('src', '')
                if pic and not pic.startswith('http'):
                    pic = urllib.parse.urljoin(self.siteUrl, pic)
            # Synopsis
            desc = ""
            desc_tag = soup.find('div', class_='text-content')
            if desc_tag:
                desc = desc_tag.text.strip()
            # Tags / categories
            tags = []
            tags_container = soup.find('span', class_='items-tags')
            if tags_container:
                tag_links = tags_container.find_all('a')
                for tag in tag_links:
                    tag_text = tag.text.strip()
                    if tag_text:
                        tags.append(tag_text)
            # Episode list
            play_from = "偷乐短剧"
            play_list = []
            # The episode links live in the swiper carousel
            play_area = soup.find('div', class_='swiper-wrapper')
            if play_area:
                episode_links = play_area.find_all('a')
                for ep in episode_links:
                    ep_title = ep.text.strip()
                    ep_url = ep.get('href', '')
                    if ep_url:
                        # Use the full URL as the episode ID
                        if not ep_url.startswith('http'):
                            ep_url = urllib.parse.urljoin(self.siteUrl, ep_url)
                        # TVBox playlist entries use the "name$url" format
                        play_list.append(f"{ep_title}${ep_url}")
            # No episode list found - fall back to the play button
            if not play_list:
                play_btn = soup.find('a', class_='btn-play')
                if play_btn:
                    play_url = play_btn.get('href', '')
                    if play_url:
                        if not play_url.startswith('http'):
                            play_url = urllib.parse.urljoin(self.siteUrl, play_url)
                        play_list.append(f"播放${play_url}")
            # Still nothing - fall back to the play page URL itself
            if not play_list:
                play_url = f"{self.siteUrl}/index.php/vod/play/id/{vid}/sid/1/nid/1.html"
                play_list.append(f"播放${play_url}")
            # Extra metadata (director, cast, etc.)
            director = ""
            actor = ""
            year = ""
            area = ""
            remarks = ""
            # Walk the meta rows and match on their Chinese labels
            meta_items = soup.find_all('div', class_='meta-item')
            for item in meta_items:
                item_title = item.find('span', class_='item-title')
                item_content = item.find('span', class_='item-content')
                if item_title and item_content:
                    title_text = item_title.text.strip()
                    content_text = item_content.text.strip()
                    if "导演" in title_text:
                        director = content_text
                    elif "主演" in title_text:
                        actor = content_text
                    elif "年份" in title_text:
                        year = content_text
                    elif "地区" in title_text:
                        area = content_text
                    elif "简介" in title_text:
                        if not desc:
                            desc = content_text
                    elif "状态" in title_text:
                        remarks = content_text
            # Fall back to the standalone remarks tag
            if not remarks:
                remarks_tag = soup.find('span', class_='remarks')
                if remarks_tag:
                    remarks = remarks_tag.text.strip()
            # Assemble the standard vod record; episodes are joined with "#"
            vod = {
                "vod_id": vid,
                "vod_name": title,
                "vod_pic": pic,
                "vod_year": year,
                "vod_area": area,
                "vod_remarks": remarks,
                "vod_actor": actor,
                "vod_director": director,
                "vod_content": desc,
                "type_name": ",".join(tags),
                "vod_play_from": play_from,
                "vod_play_url": "#".join(play_list)
            }
            result = {
                'list': [vod]
            }
        except Exception as e:
            self.log(f"Error while fetching detail content: {str(e)}", "ERROR")
        return result
    def searchContent(self, key, quick, pg=1):
        """Search the site by keyword."""
        result = {}
        videos = []
        # Search endpoint and query parameters
        search_url = f"{self.siteUrl}/index.php/vod/search.html"
        params = {"wd": key}
        try:
            response = self.fetch(search_url, data=params)
            if response and response.status_code == 200:
                html = response.text
                soup = BeautifulSoup(html, 'html.parser')
                # Result items reuse the same list markup as the category pages
                search_items = soup.find_all('li', class_='item')
                for item in search_items:
                    try:
                        # Title
                        title_tag = item.find('h3')
                        if not title_tag:
                            continue
                        title = title_tag.text.strip()
                        # Detail link
                        link_tag = item.find('a')
                        if not link_tag:
                            continue
                        link = link_tag.get('href', '')
                        if not link.startswith('http'):
                            link = urllib.parse.urljoin(self.siteUrl, link)
                        # Video ID
                        vid = self.extractVodId(link)
                        if not vid:
                            continue
                        # Poster image
                        img_tag = item.find('img')
                        img_url = ""
                        if img_tag:
                            img_url = img_tag.get('src', img_tag.get('data-src', ''))
                            if img_url and not img_url.startswith('http'):
                                img_url = urllib.parse.urljoin(self.siteUrl, img_url)
                        # Remarks (update status)
                        remarks = ""
                        remarks_tag = item.find('span', class_='remarks')
                        if remarks_tag:
                            remarks = remarks_tag.text.strip()
                        # Tags
                        tags = ""
                        tags_tag = item.find('span', class_='tags')
                        if tags_tag:
                            tags = tags_tag.text.strip()
                        # Merge remarks and tags
                        if remarks and tags:
                            remarks = f"{remarks} | {tags}"
                        elif tags:
                            remarks = tags
                        # Build the video entry
                        videos.append({
                            'vod_id': vid,
                            'vod_name': title,
                            'vod_pic': img_url,
                            'vod_remarks': remarks
                        })
                    except Exception as e:
                        self.log(f"Error while parsing a search result: {str(e)}", "ERROR")
                        continue
        except Exception as e:
            self.log(f"Search failed: {str(e)}", "ERROR")
        result['list'] = videos
        return result

    def searchContentPage(self, key, quick, pg=1):
        return self.searchContent(key, quick, pg)
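    # Note on the playerContent() return contract (standard TVBox fields):
    #   parse = 0 -> "url" is a direct stream the player can open
    #   parse = 1 -> "url" is a web page that still needs external parsing
    #   header   -> JSON-encoded request headers the player should send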
    def playerContent(self, flag, id, vipFlags):
        """Resolve the real playable URL for one episode."""
        result = {}
        try:
            # Already a direct video URL?
            if self.isVideoFormat(id):
                result["parse"] = 0
                result["url"] = id
                result["playUrl"] = ""
                result["header"] = json.dumps(self.headers)
                return result
            # A full page URL?
            if id.startswith(('http://', 'https://')):
                play_url = id
            # A site-relative path?
            elif id.startswith('/'):
                play_url = urllib.parse.urljoin(self.siteUrl, id)
            # Otherwise treat it as a video ID and build the play page URL
            else:
                # Check for the "videoID_episode" form
                parts = id.split('_')
                if len(parts) > 1 and parts[0].isdigit():
                    vid = parts[0]
                    nid = parts[1]
                    play_url = f"{self.siteUrl}/index.php/vod/play/id/{vid}/sid/1/nid/{nid}.html"
                else:
                    # Plain video ID
                    play_url = f"{self.siteUrl}/index.php/vod/play/id/{id}/sid/1/nid/1.html"
            # Fetch the play page and try to extract the real stream URL
            try:
                self.log(f"Parsing play page: {play_url}")
                response = self.fetch(play_url)
                if response and response.status_code == 200:
                    html = response.text
                    # Strategy 1: the player_aaaa JS variable (MacCMS convention)
                    player_match = re.search(r'var\s+player_aaaa\s*=\s*({.*?});', html, re.DOTALL)
                    if player_match:
                        try:
                            player_data = json.loads(player_match.group(1))
                            if 'url' in player_data:
                                video_url = player_data['url']
                                if not video_url.startswith('http'):
                                    video_url = urllib.parse.urljoin(self.siteUrl, video_url)
                                self.log(f"Got video URL from player_aaaa: {video_url}")
                                result["parse"] = 0
                                result["url"] = video_url
                                result["playUrl"] = ""
                                result["header"] = json.dumps(self.headers)
                                return result
                        except json.JSONDecodeError as e:
                            self.log(f"Failed to parse player_aaaa JSON: {str(e)}", "ERROR")
                    # player_aaaa missing or unusable - try the fallbacks below
                    # Strategy 2: a <video> tag
                    video_match = re.search(r'<video[^>]*src=["\'](.*?)["\']', html)
                    if video_match:
                        video_url = video_match.group(1)
                        if not video_url.startswith('http'):
                            video_url = urllib.parse.urljoin(self.siteUrl, video_url)
                        self.log(f"Got video URL from a <video> tag: {video_url}")
                        result["parse"] = 0
                        result["url"] = video_url
                        result["playUrl"] = ""
                        result["header"] = json.dumps(self.headers)
                        return result
                    # Strategy 3: an <iframe> player
                    iframe_match = re.search(r'<iframe[^>]*src=["\'](.*?)["\']', html)
                    if iframe_match:
                        iframe_url = iframe_match.group(1)
                        if not iframe_url.startswith('http'):
                            iframe_url = urllib.parse.urljoin(self.siteUrl, iframe_url)
                        self.log(f"Found an iframe, resolving: {iframe_url}")
                        # Fetch the iframe document
                        iframe_response = self.fetch(iframe_url)
                        if iframe_response and iframe_response.status_code == 200:
                            iframe_html = iframe_response.text
                            # Look for a video URL inside the iframe document
                            iframe_video_match = re.search(r'(https?://[^\'"]+\.(mp4|m3u8|ts))', iframe_html)
                            if iframe_video_match:
                                video_url = iframe_video_match.group(1)
                                self.log(f"Got video URL from the iframe: {video_url}")
                                result["parse"] = 0
                                result["url"] = video_url
                                result["playUrl"] = ""
                                result["header"] = json.dumps({
                                    "User-Agent": self.headers["User-Agent"],
                                    "Referer": iframe_url
                                })
                                return result
                    # Strategy 4: any URL on the page that looks like a video
                    url_match = re.search(r'(https?://[^\'"]+\.(mp4|m3u8|ts))', html)
                    if url_match:
                        video_url = url_match.group(1)
                        self.log(f"Found a likely video URL: {video_url}")
                        result["parse"] = 0
                        result["url"] = video_url
                        result["playUrl"] = ""
                        result["header"] = json.dumps(self.headers)
                        return result
            except Exception as e:
                self.log(f"Error while resolving the play URL: {str(e)}", "ERROR")
            # Every strategy failed - hand the page URL to an external parser
            self.log("No direct video URL found; external parsing required", "WARNING")
            result["parse"] = 1  # 1 = the player must parse the page itself
            result["url"] = play_url
            result["playUrl"] = ""
            result["header"] = json.dumps(self.headers)
        except Exception as e:
            self.log(f"Error while building player content: {str(e)}", "ERROR")
        return result
    def localProxy(self, param):
        """Local proxy hook; not used by this spider."""
        return [404, "text/plain", {}, "Not Found"]
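
# Minimal local smoke test - a debugging sketch only, assuming the site is
# reachable and the fallback Spider stub above is in effect. The TVBox
# runtime never executes this block.
if __name__ == "__main__":
    spider = Spider()
    spider.init()
    # Home page: categories plus recommended videos
    home = spider.homeContent(False)
    print(f"categories: {len(home.get('class', []))}, home videos: {len(home.get('list', []))}")
    # First page of the "男频" category
    cate = spider.categoryContent(spider.cateManual["男频"], 1, False, {})
    print(f"category videos: {len(cate.get('list', []))}, pages: {cate.get('pagecount')}")
    # Detail and play resolution for the first result, if any
    if cate.get('list'):
        vid = cate['list'][0]['vod_id']
        detail = spider.detailContent([vid])
        if detail.get('list'):
            # vod_play_url holds "name$url" pairs joined with "#"
            first_ep = detail['list'][0]['vod_play_url'].split('#')[0].split('$')[-1]
            play = spider.playerContent("偷乐短剧", first_ep, [])
            print(f"parse={play.get('parse')}, url={play.get('url')}")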