N3RDN/JN/dr_py/js/cntv央视.py
2024-07-03 20:36:08 +08:00

1057 lines
46 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
# !/usr/bin/python
import os.path
import random
import sys
sys.path.append('..')
try:
# from base.spider import Spider as BaseSpider
from base.spider import BaseSpider
except ImportError:
from t4.base.spider import BaseSpider
import json
import time
import base64
import datetime
import re
from urllib import request, parse
from pathlib import Path
import urllib
import urllib.request
"""
配置示例:
t4的配置里ext节点会自动变成api对应query参数extend,但t4的ext字符串不支持路径格式比如./开头或者.json结尾
api里会自动含有ext参数是base64编码后的选中的筛选条件
错误示例,ext含有json:
{
"key":"hipy_cntv央视",
"name":"cntv央视(hipy_t4)",
"type":4,
"api":"http://192.168.31.49:5707/api/v1/vod/cntv央视?api_ext={{host}}/txt/hipy/cntv央视.json",
"searchable":1,
"quickSearch":1,
"filterable":0,
"ext":"cntv央视.json"
}
正确示例。同时存在ext和api_ext会优先取ext作为extend加载init
{
"key":"hipy_t4_cntv央视",
"name":"cntv央视(hipy_t4)",
"type":4,
"api":"http://192.168.31.49:5707/api/v1/vod/cntv央视?api_ext={{host}}/txt/hipy/cntv央视.json",
"searchable":1,
"quickSearch":0,
"filterable":1,
"ext":"{{host}}/files/hipy/cntv央视.json"
},
{
"key": "hipy_t3_cntv央视",
"name": "cntv央视(hipy_t3)",
"type": 3,
"api": "{{host}}/txt/hipy/cntv央视.py",
"searchable": 1,
"quickSearch": 0,
"filterable": 1,
"ext": "{{host}}/files/hipy/cntv央视.json"
},
"""
class Spider(BaseSpider): # 元类 默认的元类 type
module = None
def getDependence(self):
    """Return the names of the spiders this spider depends on."""
    dependencies = ['base_spider']
    return dependencies
def getName(self):
    """Return the display name of this spider."""
    display_name = "中央电视台"
    return display_name
def init_api_ext_file(self):
    """Scrape the CCTV filter pages and save them as JSON next to this module.

    The "特别节目" and "纪录片" index pages are fetched with ``self.fetch`` and
    parsed with ``self.html``. For each filter list (an element id on the
    page) every ``<li>`` becomes ``{"n": <span text>, "v": <li attribute>}``.
    The two resulting filter lists are printed and written, keyed by category
    name, to a UTF-8 JSON file named like this module with a ``.json``
    extension.
    """
    # Swap only the file extension; str.replace would rewrite every ".py"
    # that happens to occur anywhere in the path.
    ext_file = os.path.splitext(__file__)[0] + '.json'
    print(f'ext_file:{ext_file}')

    def scrape_filters(page_url, groups):
        """Build one filter list; ``groups`` holds (element id, li attribute, key, name)."""
        doc = self.html(self.fetch(page_url).text)
        filters = []
        for element_id, attribute, key, name in groups:
            values = [
                {
                    'n': ''.join(li.xpath('./span//text()')),
                    'v': ''.join(li.xpath('@' + attribute)),
                }
                for li in doc.xpath(f'//*[@id="{element_id}"]/li')
            ]
            filters.append({"key": key, "name": name, "value": values})
        return filters

    # Special programmes: channel / type / first letter.
    filter_tbjm = scrape_filters('https://tv.cctv.com/yxg/tbjm/index.shtml', (
        ('pindao', 'datacd', 'datapd-channel', '频道'),
        ('fenlei', 'datalx', 'datafl-sc', '类型'),
        ('zimu', 'datazm', 'dataszm-letter', '首字母'),
    ))
    print(filter_tbjm)

    # Documentaries: channel / type / year / first letter.
    filter_jlp = scrape_filters('https://tv.cctv.com/yxg/jlp/index.shtml', (
        ('pindao', 'datacd', 'datapd-channel', '频道'),
        ('fenlei', 'datalx', 'datafl-sc', '类型'),
        ('nianfen', 'datanf', 'datanf-year', '年份'),
        ('zimu', 'datazm', 'dataszm-letter', '首字母'),
    ))
    print(filter_jlp)

    ext_file_dict = {
        "特别节目": filter_tbjm,
        "纪录片": filter_jlp,
    }
    with open(ext_file, mode='w', encoding='utf-8') as f:
        f.write(json.dumps(ext_file_dict, ensure_ascii=False))
def init(self, extend=""):
    """Load extra filter definitions and prepare the spider.

    ``self.extend`` selects an optional filter source that is merged into
    ``self.config['filter']``:

    * ``./name`` - a file path relative to this module,
    * ``http...`` - a URL whose JSON body is fetched with ``self.fetch``,
    * anything else - ``<name>.json`` next to this module.

    Afterwards "year" and "month" filter groups are appended to the
    "栏目大全" filters if they are not already present.

    :param extend: dependency list; when it is a list, the first element
        whose type name contains ``.Spider`` is stored in ``self.module``.
    """
    def load_local_filters(ext_file):
        """Merge a local JSON filter file into the config, if it exists."""
        ext_file = Path(ext_file).as_posix()
        if not os.path.exists(ext_file):
            return
        with open(ext_file, mode='r', encoding='utf-8') as f:
            try:
                self.config['filter'].update(json.loads(f.read()))
            except Exception as e:
                # Best effort: a broken extension file must not stop init.
                print(f'更新扩展筛选条件发生错误:{e}')

    print("============依赖列表:{0}============".format(extend))
    ext = self.extend
    print("============ext:{0}============".format(ext))
    if isinstance(ext, str) and ext:
        base_dir = os.path.dirname(__file__)
        if ext.startswith('./'):
            load_local_filters(os.path.join(base_dir, ext))
        elif ext.startswith('http'):
            try:
                r = self.fetch(ext)
                self.config['filter'].update(r.json())
            except Exception as e:
                print(f'更新扩展筛选条件发生错误:{e}')
        else:
            load_local_filters(os.path.join(base_dir, './' + ext + '.json'))

    # Add year / month filters to the column directory.
    lanmu_list = self.config['filter']['栏目大全']
    lanmu_keys_list = [lanmu['key'] for lanmu in lanmu_list]
    if 'year' not in lanmu_keys_list:
        currentYear = datetime.date.today().year
        yearList = [{"n": "全部", "v": ""}]
        for year in range(currentYear, currentYear - 10, -1):
            # Strings, like every other filter value: categoryContent
            # concatenates the selected year with the (string) month.
            yearList.append({"n": str(year), "v": str(year)})
        lanmu_list.append({"key": "year", "name": "年份", "value": yearList})
    if 'month' not in lanmu_keys_list:
        monthList = [{"n": "全部", "v": ""}]
        for month in range(1, 13):
            text = str(month).rjust(2, '0')
            monthList.append({"n": text, "v": text})
        lanmu_list.append({"key": "month", "name": "月份", "value": monthList})

    # Only one dependency module is needed; keep the first spider found.
    if isinstance(extend, list):
        for lib in extend:
            if '.Spider' in str(type(lib)):
                self.module = lib
                break
def isVideoFormat(self, url):
    """Placeholder hook; does nothing and returns ``None``."""
    return None
def manualVideoCheck(self):
    """Placeholder hook; does nothing and returns ``None``."""
    return None
def homeContent(self, filter):
    """Return the category list for the home page.

    :param filter: when truthy, ``self.config['filter']`` is included under
        the ``filters`` key.
    :return: ``{'class': [{'type_name', 'type_id'}, ...]}`` plus the
        optional ``filters`` entry.
    """
    # Each category uses its display name as its id.
    category_names = (
        "4K专区",
        "栏目大全",
        "特别节目",
        "纪录片",
        "电视剧",
        "动画片",
        "频道直播",
    )
    home = {
        'class': [{'type_name': name, 'type_id': name} for name in category_names],
    }
    if filter:
        home['filters'] = self.config['filter']
    return home
def homeVideoContent(self):
    """Return home-page videos from the dependency module, or an empty list."""
    if not self.module:
        return {'list': []}
    return self.module.homeVideoContent()
def categoryContent(self, tid, pg, filter, extend):
    """Return one page of entries for category ``tid``.

    :param tid: category id as produced by ``homeContent``.
    :param pg: page number, passed through to the remote API.
    :param filter: unused here.
    :param extend: mapping of selected filter key -> value; ``None`` is
        treated as "no filter selected".
    :return: dict with ``list``, ``page``, ``pagecount``, ``limit``, ``total``.
    """
    extend = extend or {}
    pagecount = 24
    year_prefix = ''  # year+month filter of the column directory

    def raw(key):
        """Selected filter value, or '' when the filter is not set."""
        return extend[key] if key in extend else ''

    def quoted(key):
        """Selected filter value, URL-quoted, or '' when not set."""
        return urllib.parse.quote(extend[key]) if key in extend else ''

    if tid == '动画片':
        url = 'https://api.cntv.cn/list/getVideoAlbumList?channelid=CHAL1460955899450127&area={0}&sc={4}&fc={1}&letter={2}&p={3}&n=24&serviceId=tvcctv&topv=1&t=json'.format(
            quoted('datadq-area'), urllib.parse.quote(tid), raw('dataszm-letter'), pg, quoted('datafl-sc'))
    elif tid == '纪录片':
        url = 'https://api.cntv.cn/list/getVideoAlbumList?channelid=CHAL1460955924871139&fc={0}&channel={1}&sc={2}&year={3}&letter={4}&p={5}&n=24&serviceId=tvcctv&topv=1&t=json'.format(
            urllib.parse.quote(tid), quoted('datapd-channel'), quoted('datafl-sc'), raw('datanf-year'),
            raw('dataszm-letter'), pg)
    elif tid == '电视剧':
        # The area filter is offered for this category and the URL carries
        # an ``area`` parameter, so the selection has to be forwarded.
        url = 'https://api.cntv.cn/list/getVideoAlbumList?channelid=CHAL1460955853485115&area={0}&sc={1}&fc={2}&year={3}&letter={4}&p={5}&n=24&serviceId=tvcctv&topv=1&t=json'.format(
            quoted('datadq-area'), quoted('datafl-sc'), urllib.parse.quote(tid), raw('datanf-year'),
            raw('dataszm-letter'), pg)
    elif tid == '特别节目':
        url = 'https://api.cntv.cn/list/getVideoAlbumList?channelid=CHAL1460955953877151&channel={0}&sc={1}&fc={2}&bigday=&letter={3}&p={4}&n=24&serviceId=tvcctv&topv=1&t=json'.format(
            quoted('datapd-channel'), quoted('datafl-sc'), urllib.parse.quote(tid), raw('dataszm-letter'), pg)
    elif tid == '栏目大全':
        # str() guards against numeric year/month values: concatenating an
        # int year with a string month would raise TypeError.
        year = str(extend.get('year') or '')
        month = str(extend.get('month') or '')
        if year:
            year_prefix = year + month
        url = 'https://api.cntv.cn/lanmu/columnSearch?&fl={0}&fc={1}&cid={2}&p={3}&n=20&serviceId=tvcctv&t=json&cb=ko'.format(
            raw('fl'), raw('fc'), raw('cid'), pg)
        pagecount = 20
    elif tid == '4K专区':
        cid = 'CHAL1558416868484111'
        url = 'https://api.cntv.cn/NewVideo/getLastVideoList4K?serviceId=cctv4k&cid={0}&p={1}&n={2}&t=json&cb=ko'.format(
            cid, pg, pagecount
        )
    else:
        # Live channels and any unknown category both use the EPG page.
        url = 'https://tv.cctv.com/epg/index.shtml'

    videos = []
    htmlText = self.fetch(url).text
    if tid in ('栏目大全', '4K专区'):
        # Both endpoints are called with cb=ko: drop the "ko(" prefix and
        # the trailing ");" to get plain JSON.
        index = htmlText.rfind(');')
        if index > -1:
            htmlText = htmlText[3:index]
        if tid == '栏目大全':
            videos = self.get_list1(html=htmlText, tid=tid, year_prefix=year_prefix)
        else:
            videos = self.get_list_4k(html=htmlText, tid=tid)
    elif tid == '频道直播':
        html = self.html(htmlText)
        lis = html.xpath('//*[@id="jiemudan01"]//div[contains(@class,"channel_con")]//ul/li')
        for li in lis:
            vid = ''.join(li.xpath('./img/@title'))
            pic = ''.join(li.xpath('./img/@src'))
            pic = self.urljoin('https://tv.cctv.com/epg/index.shtml', pic)
            videos.append({
                'vod_id': '||'.join([tid, vid, f'https://tv.cctv.com/live/{vid}/', pic]),
                'vod_name': vid,
                'vod_pic': pic,
                'vod_mark': '',
            })
    else:
        videos = self.get_list(html=htmlText, tid=tid)

    return {
        'list': videos,
        'page': pg,
        # A full page suggests more pages; otherwise this is the last one.
        'pagecount': 9999 if len(videos) >= pagecount else pg,
        'limit': 90,
        'total': 999999,
    }
def detailContent(self, array):
    """Build the detail record (including the episode list) for one entry.

    ``array[0]`` is a ``vod_id`` made by the list builders of this class:
    fields joined with ``||`` in the order category, title, page URL, image,
    id, year, actors, brief, optionally prefixed by ``<year+month>$$$``.
    Live-channel ids only carry the first four fields.

    :return: ``{'list': [vod]}``, or ``{}`` when no episode could be found.
    """
    year_prefix = ''
    did = array[0]
    if '$$$' in did:
        year_prefix, did = did.split('$$$', 1)
    aid = did.split('||')
    tid, title, lastVideo, logo = aid[0], aid[1], aid[2], aid[3]

    if tid == '频道直播':
        vod = {
            "vod_id": did,
            "vod_name": title.replace(' ', ''),
            "vod_pic": logo,
            "vod_content": f'频道{title}正在直播中',
            "vod_play_from": '道长在线直播',
            "vod_play_url": f'在线观看${title}||{lastVideo}',
        }
        return {'list': [vod]}

    album_id = aid[4]
    vod_year = aid[5]
    actors = aid[6] if len(aid) > 6 else ''
    # The description can be missing when the id was truncated in transit.
    brief = aid[7] if len(aid) > 7 else ''
    fromId = 'CCTV'

    if tid == "栏目大全":
        # The stored id is the guid of the latest video; resolve the column
        # id from it, then list that column's videos.
        lastUrl = 'https://api.cntv.cn/video/videoinfoByGuid?guid={0}&serviceId=tvcctv'.format(album_id)
        topicId = json.loads(self.fetch(lastUrl).text)['ctid']
        url = 'https://api.cntv.cn/NewVideo/getVideoListByColumn'
        Url = "{0}?id={1}&d=&p=1&n=100&sort=desc&mode=0&serviceId=tvcctv&t=json&d={2}".format(
            url, topicId, year_prefix)
    elif tid == "4K专区":
        Url = 'https://api.cntv.cn/NewVideo/getVideoListByAlbumIdNew?id={0}&serviceId=cctv4k&p=1&n=100&mode=0&pub=1'.format(
            album_id)
        print(Url)
    else:
        Url = 'https://api.cntv.cn/NewVideo/getVideoListByAlbumIdNew?id={0}&serviceId=tvcctv&p=1&n=100&mode=0&pub=1'.format(
            album_id)

    # Per-category regexes used to scrape episodes from the album page when
    # the list API returns nothing.
    album_pattern = r"'title':\s*'(?P<title>.+?)',\n{0,1}\s*'brief':\s*'(.+?)',\n{0,1}\s*'img':\s*'(.+?)',\n{0,1}\s*'url':\s*'(?P<url>.+?)'"
    fallback_patterns = {
        "电视剧": album_pattern,
        "纪录片": album_pattern,
        "4K专区": album_pattern,
        "特别节目": r'class="tp1"><a\s*href="(?P<url>https://.+?)"\s*target="_blank"\s*title="(?P<title>.+?)"></a></div>',
        "动画片": r"'title':\s*'(?P<title>.+?)',\n{0,1}\s*'img':\s*'(.+?)',\n{0,1}\s*'brief':\s*'(.+?)',\n{0,1}\s*'url':\s*'(?P<url>.+?)'",
        "栏目大全": r'href="(?P<url>.+?)" target="_blank" alt="(?P<title>.+?)" title=".+?">',
    }

    videoList = []
    try:
        if tid == "搜索":
            # Search hits are single videos: the page URL is the episode.
            fromId = '中央台'
            videoList = [title + "$" + lastVideo]
        else:
            jsonList = json.loads(self.fetch(Url).text)['data']['list']
            videoList = self.get_EpisodesList(jsonList=jsonList)
            if len(videoList) < 1 and tid in fallback_patterns:
                htmlTxt = self.fetch(lastVideo).text
                videoList = self.get_EpisodesList_re(htmlTxt=htmlTxt, patternTxt=fallback_patterns[tid])
                fromId = '央视'
    except Exception as e:
        # Best effort: report the failure and fall through to the empty result.
        print(f'detailContent failed to load episodes: {e}')

    if len(videoList) == 0:
        return {}

    vod = {
        "vod_id": did,
        "vod_name": title.replace(' ', ''),
        "vod_pic": logo,
        "type_name": tid,
        "vod_year": vod_year,
        "vod_area": "",
        "vod_remarks": '',
        "vod_actor": actors,
        "vod_director": '',
        "vod_content": brief,
        "vod_play_from": fromId,
        "vod_play_url": "#".join(videoList),
    }
    return {'list': [vod]}
def get_lineList(self, Txt, mark, after):
    """Collect the slices of ``Txt`` running from each ``mark`` up to the next ``after``.

    The class body defines ``get_lineList`` a second time further down with
    the same logic; that later definition is the one bound on the class.
    Scanning only continues while ``mark`` is found at an index above 8.
    """
    segments = []
    start = Txt.find(mark)
    while start > 8:
        stop = Txt.find(after, start)
        segments.append(Txt[start:stop])
        start = Txt.find(mark, stop)
    return segments
def get_RegexGetTextLine(self, Text, RegexText, Index):
    """Return all matches of ``RegexText`` in ``Text`` as a list.

    The pattern is compiled with ``re.M | re.S``; ``Index`` is accepted but
    not used. The class body defines this method again further down with the
    same logic; that later definition is the one bound on the class.
    """
    matches = re.compile(RegexText, re.M | re.S).findall(Text)
    return list(matches)
def searchContent(self, key, quick, pg=1):
    """Search CCTV videos for ``key``.

    :param key: search text; URL-quoted before it is sent.
    :param quick: unused here.
    :param pg: result page to request (the request used to be pinned to
        page 1 regardless of this argument).
    :return: ``{'list': [...]}`` as built by ``get_list_search``.
    """
    key = urllib.parse.quote(key)
    Url = 'https://search.cctv.com/ifsearch.php?page={1}&qtext={0}&sort=relevance&pageSize=20&type=video&vtime=-1&datepid=1&channel=&pageflag=0&qtext_str={0}'.format(
        key, pg)
    htmlTxt = self.fetch(Url).text
    videos = self.get_list_search(html=htmlTxt, tid='搜索')
    return {'list': videos}
def playerContent(self, flag, id, vipFlags):
    """Resolve a play URL for one episode.

    :param flag: play source name from ``vod_play_from``.
    :param id: for ``CCTV`` a video guid, for ``道长在线直播`` a
        ``"<channel>||<page url>"`` pair, otherwise a page URL.
    :param vipFlags: unused here.
    :return: dict with ``parse`` (1 = let the client sniff the page,
        0 = play directly), ``playUrl``, ``url`` and ``header``.
    """
    url = ''
    parse = 0
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    }
    if flag == 'CCTV':
        url = self.get_m3u8(urlTxt=id)
    elif flag == '道长在线直播':
        title = id.split('||')[0]  # channel name
        _url = f'https://vdn.live.cntv.cn/api2/liveHtml5.do?channel=pc://cctv_p2p_hd{title}&channel_id={title}'
        htmlTxt = self.fetch(_url).text
        vdata = self.regStr(htmlTxt, "var .*?=.*?'(.*?)';")
        vdata = self.str2json(vdata)
        print(vdata)
        url = vdata['hls_url']['hls1']
        print(url)
        url = self.fixm3u8_url(url)
    else:
        try:
            htmlTxt = self.fetch(id).text
            guid = self.get_RegexGetText(Text=htmlTxt, RegexText=r'var\sguid\s*=\s*"(.+?)";', Index=1)
            url = self.get_m3u8(urlTxt=guid)
        except Exception as e:
            # Could not resolve a stream: hand the page to the sniffer.
            # ``Exception`` rather than a bare except so that
            # KeyboardInterrupt / SystemExit still propagate.
            print(f'playerContent falling back to sniffing: {e}')
            url = id
            parse = 1
    if 'https:' not in url:
        # No usable stream URL was produced; let the client sniff ``id``.
        url = id
        parse = 1
    return {
        "parse": parse,  # 1 = sniff, 0 = play
        "playUrl": '',
        "url": url,
        "header": headers,
    }
# 分类抓取地址:
# 栏目大全:https://tv.cctv.com/lm/index.shtml?spm=C28340.Pu9TN9YUsfNZ.E2PQtIunpEaz.24
# 电视剧:https://tv.cctv.com/yxg/index.shtml?spm=C28340.PlFTqGe6Zk8M.E2PQtIunpEaz.65#datacid=dsj&datafl=&datadq=&fc=%E7%94%B5%E8%A7%86%E5%89%A7&datanf=&dataszm=
# 动画片:https://tv.cctv.com/yxg/index.shtml?spm=C28340.PlFTqGe6Zk8M.E2PQtIunpEaz.65#datacid=dhp&datafl=&datadq=&fc=%E5%8A%A8%E7%94%BB%E7%89%87&dataszm=
# 纪录片:https://tv.cctv.com/yxg/index.shtml?spm=C28340.PlFTqGe6Zk8M.E2PQtIunpEaz.65#datacid=jlp&datapd=&datafl=&fc=%E7%BA%AA%E5%BD%95%E7%89%87&datanf=&dataszm=
# 特别节目:https://tv.cctv.com/yxg/index.shtml?spm=C28340.PlFTqGe6Zk8M.E2PQtIunpEaz.65#datacid=tbjm&datapd=&datafl=&fc=%E7%89%B9%E5%88%AB%E8%8A%82%E7%9B%AE&datajr=&dataszm=
# Built-in filter definitions, keyed by category name. Every group is
# {"key", "name", "value": [{"n": label, "v": value}, ...]} and starts with
# the "全部" (all) option whose value is the empty string.
# The helpers below exist only while ``config`` is being built. They must not
# reference each other from inside their bodies, so the block also works as
# part of a class body.
_same = lambda labels: [{"n": "全部", "v": ""}] + [{"n": label, "v": label} for label in labels]
_pairs = lambda items: [{"n": "全部", "v": ""}] + [{"n": label, "v": value} for label, value in items]
# Initial letters offered by the album filters (no B or D in the list).
_LETTERS = "A C E F G H I J K L M N O P Q R S T U V W X Y Z 0-9".split()
config = {
    "player": {},
    "filter": {
        "电视剧": [
            {"key": "datafl-sc", "name": "类型",
             "value": _same("谍战 悬疑 刑侦 历史 古装 武侠 军旅 战争 喜剧 青春 言情 偶像 家庭 年代 革命 农村 都市 其他".split())},
            {"key": "datadq-area", "name": "地区",
             "value": _pairs([("中国大陆", "中国大陆"), ("中国香港", "香港"), ("美国", "美国"),
                              ("欧洲", "欧洲"), ("泰国", "泰国")])},
            {"key": "datanf-year", "name": "年份",
             "value": _same([str(year) for year in range(2024, 1996, -1)])},
            {"key": "dataszm-letter", "name": "字母", "value": _same(_LETTERS)},
        ],
        "动画片": [
            {"key": "datafl-sc", "name": "类型",
             "value": _same("亲子 搞笑 冒险 动作 宠物 体育 益智 历史 教育 校园 言情 武侠 经典 未来 古代 神话 真人 励志 热血 奇幻 童话 剧情 夺宝 其他".split())},
            {"key": "datadq-area", "name": "地区",
             "value": _same(["中国大陆", "美国", "欧洲"])},
            {"key": "dataszm-letter", "name": "字母", "value": _same(_LETTERS)},
        ],
        "纪录片": [
            {"key": "datafl-sc", "name": "类型",
             "value": _same("人文历史 人物 军事 探索 社会 时政 经济 科技".split())},
            {"key": "datanf-year", "name": "年份",
             "value": _same([str(year) for year in range(2024, 2007, -1)])},
            {"key": "dataszm-letter", "name": "字母", "value": _same(_LETTERS)},
        ],
        "特别节目": [
            # This list carries a second "全部" entry whose value is "全部".
            {"key": "datafl-sc", "name": "类型",
             "value": _same("全部 新闻 经济 综艺 体育 军事 影视 科教 戏曲 青少 音乐 社会 公益 其他".split())},
            {"key": "dataszm-letter", "name": "字母", "value": _same(_LETTERS)},
        ],
        "栏目大全": [
            {"key": "cid", "name": "频道",
             "value": _pairs([
                 ("CCTV-1综合", "EPGC1386744804340101"),
                 ("CCTV-2财经", "EPGC1386744804340102"),
                 ("CCTV-3综艺", "EPGC1386744804340103"),
                 ("CCTV-4中文国际", "EPGC1386744804340104"),
                 ("CCTV-5体育", "EPGC1386744804340107"),
                 ("CCTV-6电影", "EPGC1386744804340108"),
                 ("CCTV-7国防军事", "EPGC1386744804340109"),
                 ("CCTV-8电视剧", "EPGC1386744804340110"),
                 ("CCTV-9纪录", "EPGC1386744804340112"),
                 ("CCTV-10科教", "EPGC1386744804340113"),
                 ("CCTV-11戏曲", "EPGC1386744804340114"),
                 ("CCTV-12社会与法", "EPGC1386744804340115"),
                 ("CCTV-13新闻", "EPGC1386744804340116"),
                 ("CCTV-14少儿", "EPGC1386744804340117"),
                 ("CCTV-15音乐", "EPGC1386744804340118"),
                 ("CCTV-16奥林匹克", "EPGC1634630207058998"),
                 ("CCTV-17农业农村", "EPGC1563932742616872"),
                 ("CCTV-5+体育赛事", "EPGC1468294755566101"),
             ])},
            {"key": "fc", "name": "分类",
             "value": _same("新闻 体育 综艺 健康 生活 科教 经济 农业 法治 军事 少儿 动画 纪实 戏曲 音乐 影视".split())},
            {"key": "fl", "name": "字母",
             "value": _same([chr(code) for code in range(ord("A"), ord("Z") + 1)])},
        ],
    },
}
del _same, _pairs, _LETTERS
# Desktop-browser request headers for tv.cctv.com.
header = {}
header["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36"
header["Host"] = "tv.cctv.com"
header["Referer"] = "https://tv.cctv.com/"
def localProxy(self, params):
    """Return a fixed ``[status, content type, body]`` triple; ``params`` is ignored."""
    status, content_type, body = 200, "video/MP2T", ""
    return [status, content_type, body]
# -----------------------------------------------自定义函数-----------------------------------------------
# 访问网页
def webReadFile(self, urlStr, header):
    """Download ``urlStr`` with urllib and return the body decoded as UTF-8.

    ``header`` is accepted for call compatibility but is not sent.
    """
    request_obj = urllib.request.Request(url=urlStr)
    with urllib.request.urlopen(request_obj) as response:
        return response.read().decode('utf-8')
# 判断网络地址是否存在
def TestWebPage(self, urlStr, header):
    """Probe ``urlStr`` with a HEAD request and return its status code.

    ``urllib.request.urlopen`` raises for HTTP error statuses and for
    unreachable targets, so callers that compare the result with 200 could
    never reach their "not available" branch. Those failures are now turned
    into a return value.

    :param urlStr: URL to probe.
    :param header: accepted for call compatibility but not sent.
    :return: the response status code, the HTTP error status when the server
        answers with one, or ``0`` when the URL cannot be opened at all.
    """
    from urllib.error import HTTPError, URLError

    req = urllib.request.Request(url=urlStr, method='HEAD')
    try:
        with urllib.request.urlopen(req) as response:
            return response.getcode()
    except HTTPError as e:
        # HTTPError is a URLError subclass, so it has to be handled first.
        return e.code
    except URLError:
        return 0
# 正则取文本
def get_RegexGetText(self, Text, RegexText, Index):
    """Return group ``Index`` of the first ``RegexText`` match in ``Text``.

    The search uses ``re.M | re.S``; an empty string is returned when the
    pattern does not match.
    """
    found = re.search(RegexText, Text, re.M | re.S)
    return found.group(Index) if found is not None else ""
# 取集数
def get_EpisodesList(self, jsonList):
    """Turn API episode records into ``"<title>$<guid>"`` strings.

    Records whose ``guid`` is empty are skipped.
    """
    episodes = []
    for record in jsonList:
        guid, name = record['guid'], record['title']
        if len(guid) > 0:
            episodes.append(name + "$" + guid)
    return episodes
# 取集数
def get_EpisodesList_re(self, htmlTxt, patternTxt):
    """Scrape ``"<title>$<url>"`` episode strings out of ``htmlTxt``.

    ``patternTxt`` must define the named groups ``url`` and ``title``; it is
    applied with ``re.M | re.S``. Matches with an empty ``url`` are skipped.
    """
    episodes = []
    for match in re.finditer(patternTxt, htmlTxt, re.M | re.S):
        link, name = match.group('url'), match.group('title')
        if len(link) > 0:
            episodes.append(name + "$" + link)
    return episodes
# 取剧集区
def get_lineList(self, Txt, mark, after):
    """Collect the slices of ``Txt`` running from each ``mark`` up to the next ``after``.

    Each slice starts at an occurrence of ``mark`` and stops just before the
    following ``after``. Scanning only continues while ``mark`` is found at
    an index above 8.
    """
    pieces = []
    begin = Txt.find(mark)
    while begin > 8:
        finish = Txt.find(after, begin)
        pieces.append(Txt[begin:finish])
        begin = Txt.find(mark, finish)
    return pieces
# 正则取文本,返回数组
def get_RegexGetTextLine(self, Text, RegexText, Index):
    """Return every match of ``RegexText`` in ``Text`` as a list.

    The pattern is compiled with ``re.M | re.S``. ``Index`` is accepted but
    not used.
    """
    compiled = re.compile(RegexText, re.M | re.S)
    return [hit for hit in compiled.findall(Text)]
# 删除html标签
def removeHtml(self, txt):
    """Strip HTML tags from ``txt`` and turn ``&nbsp;`` into a plain space."""
    without_tags = re.sub(r'<[^>]+>', '', txt, flags=re.S)
    return without_tags.replace("&nbsp;", " ")
def hookM3u8(self, url):
    """Apply one randomly chosen rewrite to an m3u8 URL.

    One of three substitutions is picked with ``random.choice`` and applied
    to the first occurrence only: ``asp/`` -> ``asp//``, ``hls/`` ->
    ``hls//``, or the ``newcntv.qcloudcdn.com`` origin ->
    ``https://hls.cntv.myalicdn.com/``.
    Background: https://www.52pojie.cn/thread-1932358-1-1.html

    @param url: stream URL; ``None`` is treated as an empty string.
    @return: the rewritten URL.
    """
    url = url or ''
    rewrites = [
        ('asp/', 'asp//'),
        ('hls/', 'hls//'),
        ('https://newcntv.qcloudcdn.com', 'https://hls.cntv.myalicdn.com/'),
    ]
    old, new = random.choice(rewrites)
    return url.replace(old, new, 1)
# 取m3u8
def get_m3u8(self, urlTxt):
    """Resolve the best-quality m3u8 URL for a video guid.

    Steps, as implemented below:

    1. ask ``getHttpVideoInfo.do`` for the video's ``hls_url``;
    2. read the variant playlist from ``dh5.cntv.qcloudcdn.com``, where the
       ``/asp/hls/`` path is served as ``/asp/h5e/hls/``;
    3. take the name of the last variant listed (e.g. ``2000``) and
       substitute it for ``main`` in the original path;
    4. point the URL at ``newcntv.qcloudcdn.com`` and confirm it answers 200.

    Reference: https://blog.csdn.net/panwang666/article/details/135347859

    @param urlTxt: video guid (``pid`` of the info API).
    @return: playable m3u8 URL, or ``''`` when it cannot be resolved.
    """
    api_url = "https://vdn.apps.cntv.cn/api/getHttpVideoInfo.do?pid={0}".format(urlTxt)
    jo = json.loads(self.fetch(api_url).text)
    link = jo['hls_url'].strip()

    # Scheme + host of the returned link. ``A-Z`` (not ``A-z``, which also
    # matches ``[\]^_`` and the backtick) plus ``-`` for hyphenated hosts.
    urlPrefix = self.get_RegexGetText(Text=link, RegexText=r'(https?://[a-zA-Z0-9.-]+)/', Index=1)
    if not urlPrefix:
        # Without a prefix the replacements below would corrupt the URL
        # (str.replace with an empty pattern inserts between every char).
        return ''

    # Swapping the host/path prefix exposes the higher-quality variant list.
    new_link = link.replace(f'{urlPrefix}/asp/hls/', 'https://dh5.cntv.qcloudcdn.com/asp/h5e/hls/').split('?')[0]
    content = self.webReadFile(urlStr=new_link, header=self.header).strip()
    last_variant = content.split('\n')[-1]
    maxVideo = last_variant.split('/')[-1].replace('.m3u8', '')

    # Replace "main" only in the part after the host so that a host name
    # containing "main" (e.g. "...domain...") is never touched.
    head, _, tail = link.partition(urlPrefix)
    hdUrl = head + 'https://newcntv.qcloudcdn.com' + tail.replace('main', maxVideo)

    if self.TestWebPage(urlStr=hdUrl, header=self.header) != 200:
        return ''
    url = self.hookM3u8(hdUrl.split('?')[0])
    self.log(f'视频链接: {url}')
    return url
def fixm3u8_url(self, url):
    """Resolve a live master playlist to the URL of its first variant.

    The playlist at ``url`` (query string dropped) is downloaded, its first
    URI line is joined onto the playlist URL and the result is probed with
    ``TestWebPage``.

    The variant used to be picked by a fixed line index (3 or 2, depending
    on whether ``EXT-X-VERSION`` was present), which raised IndexError on
    short playlists and picked the wrong line when blank lines or extra tags
    were present. Playlist lines starting with ``#`` are tags or comments,
    so the first other non-blank line is the first URI.

    :param url: master playlist URL.
    :return: the variant URL without query string, or ``''`` when the
        playlist lists no URI or the variant does not answer with 200.
    """
    new_link = url.split('?')[0]
    content = self.webReadFile(urlStr=new_link, header=self.header).strip()
    subUrl = ''
    for line in content.splitlines():
        candidate = line.strip()
        if candidate and not candidate.startswith('#'):
            subUrl = candidate
            break
    if not subUrl:
        return ''
    hdUrl = self.urljoin(new_link, subUrl).split('?')[0]
    if self.TestWebPage(urlStr=hdUrl, header=self.header) != 200:
        return ''
    self.log(f'视频链接: {hdUrl}')
    return hdUrl
# 搜索
def get_list_search(self, html, tid):
    """Convert a search API response into list entries.

    :param html: JSON text with a top-level ``list`` array.
    :param tid: category id stored as the first field of each ``vod_id``.
    :return: list of ``{"vod_id", "vod_name", "vod_pic", "vod_remarks"}``;
        hits with an empty ``urllink`` are skipped.
    """
    entries = []
    for hit in json.loads(html)['list']:
        link = hit['urllink']
        name = self.removeHtml(txt=hit['title'])
        picture = hit['imglink']
        video_id = hit['id']
        channel = hit['channel']
        uploaded = hit['uploadtime']
        if len(link) == 0:
            continue
        # Field order is what detailContent expects; the actors slot is empty.
        fields = [tid, name, link, picture, video_id, uploaded, '', channel]
        entries.append({
            "vod_id": "||".join(fields),
            "vod_name": name,
            "vod_pic": picture,
            "vod_remarks": uploaded,
        })
    return entries
def get_list1(self, html, tid, year_prefix=None):
    """Convert a column-search API response into list entries.

    :param html: JSON text with ``response.docs``.
    :param tid: category id stored as the first field of each ``vod_id``.
    :param year_prefix: optional year+month filter; when set it is
        prepended to the ``vod_id`` as ``<year_prefix>$$$``.
    :return: list of ``{"vod_id", "vod_name", "vod_pic", "vod_remarks"}``;
        columns with an empty website URL are skipped.
    """
    # NOTE(review): the separator literal was empty in the file as received,
    # which makes ``str.split`` raise ValueError for every column. It is
    # restored to the closing title mark, on the assumption that video
    # titles look like "《Column》 20240703 ..." - confirm against upstream.
    title_close = '》'
    jRoot = json.loads(html)
    data = jRoot['response']
    if data is None:
        return []
    videos = []
    for vod in data['docs']:
        guid_code = vod['lastVIDE']['videoSharedCode']
        desc = vod['lastVIDE']['videoTitle']
        title = vod['column_name']
        url = vod['column_website']
        img = vod['column_logo']
        year = vod['column_playdate']
        brief = vod['column_brief']
        actors = ''
        if len(url) == 0:
            continue
        guid = "||".join([tid, title, url, img, guid_code, year, actors, brief])
        # Remark: the part of the latest video's title after the column name.
        remarks = desc.split(title_close)[1].strip() if title_close in desc else desc.strip()
        videos.append({
            "vod_id": year_prefix + '$$$' + guid if year_prefix else guid,
            "vod_name": title,
            "vod_pic": img,
            "vod_remarks": remarks,
        })
    return videos
# 分类取结果
def get_list(self, html, tid):
    """Convert an album-list API response into list entries.

    :param html: JSON text with ``data.list``.
    :param tid: category id stored as the first field of each ``vod_id``.
    :return: list of ``{"vod_id", "vod_name", "vod_pic", "vod_remarks"}``;
        albums with an empty URL are skipped.
    """
    jRoot = json.loads(html)
    data = jRoot['data']
    if data is None:
        return []
    videos = []
    for vod in data['list']:
        url = vod['url']
        title = vod['title']
        img = vod['image']
        album_id = vod['id']
        # Optional fields: missing or null becomes '' (same handling as
        # get_list_4k), because "||".join needs strings.
        brief = vod.get('brief') or ''
        year = vod.get('year') or ''
        actors = vod.get('actors') or ''
        if len(url) == 0:
            continue
        guid = "||".join([tid, title, url, img, album_id, year, actors, brief])
        videos.append({
            "vod_id": guid,
            "vod_name": title,
            "vod_pic": img,
            "vod_remarks": ''
        })
    return videos
# 4k分类取结果
def get_list_4k(self, html, tid):
    """Convert a 4K-zone API response into list entries.

    Each album contributes its ``id`` and ``title`` (used as the remark);
    the remaining fields come from the album's ``last_video`` record.

    :param html: JSON text with ``data.list``.
    :param tid: category id stored as the first field of each ``vod_id``.
    :return: list of ``{"vod_id", "vod_name", "vod_pic", "vod_remarks"}``;
        entries whose latest video has an empty URL are skipped.
    """
    payload = json.loads(html)['data']
    if payload is None:
        return []
    entries = []
    for album in payload['list']:
        album_title = album['title']
        album_id = album['id']
        latest = album['last_video']
        picture = latest['image']
        link = latest['url']
        name = latest['title']
        summary = latest.get('brief') or ''
        released = latest.get('year') or ''
        cast = latest.get('actors') or ''
        if len(link) == 0:
            continue
        fields = [tid, name, link, picture, album_id, released, cast, summary]
        entries.append({
            "vod_id": "||".join(fields),
            "vod_name": name,
            "vod_pic": picture,
            "vod_remarks": album_title,
        })
    return entries
if __name__ == '__main__':
    # Manual smoke test: list a category, open its first entry and resolve
    # the first episode's play URL.
    from t4.core.loader import t4_spider_init

    spider = Spider()
    t4_spider_init(spider)

    cate_content = spider.categoryContent('栏目大全', 1, {'cid': 'n'}, {})
    print(cate_content)

    vid = cate_content['list'][0]['vod_id']
    print(vid)

    detail_content = spider.detailContent([vid])
    print(detail_content)

    first_vod = detail_content['list'][0]
    vod_play_from = first_vod['vod_play_from']
    vod_play_url = first_vod['vod_play_url']
    print(vod_play_from, vod_play_url)

    # Episodes are joined with "#", each one being "<title>$<url or guid>".
    first_episode = vod_play_url.split('#')[0]
    _url = first_episode.split('$')[1]
    print(_url)
    print('vod_play_from:', vod_play_from, ' vod_play_url:', _url)

    play = spider.playerContent(vod_play_from, _url, None)
    print(play)