这里有一个混淆配置:
http://videolectures.net/icml2015_liang_language_understanding/video/1/page.map
这个 URL 是由下面配置中的 slug 和 video 字段拼接而成的(注意不是 site_slug):
var viipg = {
cfg: {
slug: 'icml2015_liang_language_understanding',
type: 'Lecture',
obj_id: 23694,
video: 1,
video_id: 23648,
videos: [1, 2],
chrome_colors: ["FFFFFF", "F21F1F"],
livepipe: '//videolectures.net',
site_slug: 'vln',
media_url: 'https://static.videolectures.net/r.1483388978/',
sentry: '//161ef9a5c3a14af1848399909b890522@sentry.viidea.com/6'
},
..........................
};
该文件由 JS 中的算法解码。我把压缩(minified)后的代码重新整理成了一个 Node.js 脚本:
"use strict"
// Alphabet used by the decoder: a character's position in this string is its
// numeric value. Listed group by group; the joined string is the lookup table.
const b = [
  "abcdefghijklmnopqrstuvwxyz", // positions 0-25
  "ABCDEFGHIJKLMNOPQRSTUVWXYZ", // positions 26-51
  "0123456789", // positions 52-61
  "~-_.,;:<>!?*+=\"'#$%&/\\()", // punctuation, positions 62-85
  "\n\t ", // newline, tab, space: positions 86-88
].join("");
// Alphabet size; all decoder arithmetic is done modulo this value.
const c = b.length;
/**
 * Turn a key string into the numeric shift used by the decoder.
 *
 * Adds up the position of each character of `a` inside the alphabet `b`.
 * A character that is not in the alphabet contributes -1 (the `indexOf`
 * miss value). A negative total is clamped to 0.
 *
 * @param {string} a - Key text (the script passes the lecture slug).
 * @returns {number} The non-negative sum of alphabet positions.
 */
function d(a) {
  let total = 0;
  // Index by UTF-16 code unit, exactly like the alphabet lookup in `e`.
  for (let pos = 0; pos < a.length; pos += 1) {
    total += b.indexOf(a[pos]);
  }
  return Math.max(total, 0);
}
/**
 * Decode an obfuscated string with a position-dependent shift cipher.
 *
 * Each character of `a` that belongs to the alphabet `b` is replaced by the
 * alphabet character found `d(e) + 6 * position` places earlier, wrapping
 * around the alphabet. Characters outside the alphabet are copied through
 * unchanged, but they still occupy a position.
 *
 * @param {string} a - Obfuscated text.
 * @param {string} e - Key text; converted to a numeric shift with `d`.
 * @returns {string} The decoded text, same length as `a`.
 */
function e(a, e) {
  const shift = d(e);
  const decoded = Array.from({ length: a.length }, (_, pos) => {
    const ch = a[pos];
    const rank = b.indexOf(ch);
    if (rank === -1) {
      return ch;
    }
    // JS `%` keeps the sign of the dividend, so add `c` and reduce again to
    // land in the range 0..c-1.
    const plainRank = (((rank - shift - 6 * pos) % c) + c) % c;
    return b[plainRank];
  });
  return decoded.join("");
}
// Lecture slug: it is both part of the page.map URL and the decoding key.
const slug = "icml2015_liang_language_understanding";
// page.map for video part 1 of the lecture.
const pageMapUrl = `http://videolectures.net/${slug}/video/1/page.map`;

require('http').get(pageMapUrl, (res) => {
  if (res.statusCode !== 200) {
    console.error(`Request for ${pageMapUrl} failed with HTTP status ${res.statusCode}`);
    res.resume(); // discard the body so the connection is released
    return;
  }
  res.setEncoding('utf8');
  // 'data' can fire more than once. The decoder's shift depends on each
  // character's absolute position, so the whole body must be collected
  // before decoding; decoding chunk by chunk would restart the position at 0
  // and drop the first character of every chunk.
  let body = '';
  res.on('data', (chunk) => {
    body += chunk;
  });
  res.on('end', () => {
    // The decoder is applied to everything after the first character.
    console.log(e(body.substring(1), slug));
  });
}).on('error', (err) => {
  console.error(`Request for ${pageMapUrl} failed: ${err.message}`);
});
它运行良好并将文件解码为有效的xml:
<?xml version="1.0" standalone="yes"?>
<smil>
<head>
<meta name="title" content="Natural Language Understanding: Foundations and State-of-the-Art" />
<meta name="abstract" content="Building systems that can understand human language—being able to answer questions, follow instructions, carry on dialogues—has been a long-standing challenge since the early days of AI. Due to recent advances in machine learning, there is again renewed interest in taking on this formidable task. A major question is how one represents and learns the semantics (meaning) of natural language, to which there are only partial answers. The goal of this tutorial is (i) to describe the linguistic and statistical challenges that any system must address; and (ii) to describe the types of cutting edge approaches and the remaining open problems. Topics include distributional semantics (e.g., word vectors), frame semantics (e.g., semantic role labeling), model-theoretic semantics (e.g., semantic parsing), the role of context, grounding, neural networks, latent variables, and inference. The hope is that this unified presentation will clarify the landscape, and show that this is an exciting time for the machine learning community to engage in the problems in natural language understanding." />
<meta name="part" content="1" />
<meta name="date" content="Oct. 28, 2015" />
<meta name="type" content="Tutorial" />
<layout></layout>
</head>
<body>
<switch region="video" dur="0:58:19" type="v">
<video id="1001866" proto="rtmp" width="400" height="300" systemBitrate="416819" size="182348171" apptype="v" ext="mp4" type="video/mp4" streamer="rtmp://hydro2.videolectures.net/vod" src="mp4:v012/01/ae6se7y42if65i27glt3yfanffv5rvtk.mp4"/>
<video id="1001866" proto="m3u8" width="400" height="300" systemBitrate="416819" size="182348171" apptype="v" ext="mp4" type="application/x-mpegURL" src="http://hydro2.videolectures.net/vod/_definst_/mp4:v012/01/ae6se7y42if65i27glt3yfanffv5rvtk.mp4/playlist.m3u8"/>
<video id="1001866" proto="secure_download" width="400" height="300" systemBitrate="416819" size="182348171" apptype="v" ext="mp4" type="video/mp4" src="http://videolectures.net/site/secure_dl/0345805d8bef53e4f3b95b58b3b9a4c2/5f811e37/ae6se7y42if65i27glt3yfanffv5rvtk/tag=1001866/icml2015_liang_language_understanding_01_400x300_h264.mp4"/>
<video id="1001866" proto="http" width="400" height="300" systemBitrate="416819" size="182348171" apptype="v" ext="mp4" type="video/mp4" src="http://hydro.ijs.si/v012/01/ae6se7y42if65i27glt3yfanffv5rvtk.mp4"/>
<image src="http://hydro.ijs.si/v012/41/iff64giyif2niuz3bfwkkv6p7z7nko3g.jpg" width="400" height="300" type="screenshot"/>
<image src="http://hydro.ijs.si/v012/41/ihgisuqj4vmqfygwzwuf7ifugpihurga.jpg" width="156" height="96" type="thumbnail"/>
</switch>
</body>
</smil>
相关的 JS 代码位于 script-player.js 和 smile.min.js 中。
以下python脚本从源页面提取JSON参数,获取配置,解码配置并解析xml以获取视频url:
import requests
from bs4 import BeautifulSoup
import re
import json
# Extract the `cfg` object embedded in the lecture page's inline script.
r = requests.get("http://videolectures.net/icml2015_liang_language_understanding/", timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
# Raw string so that `\s` reaches the regex engine instead of being treated
# as an (invalid) string escape.
scripts = soup.find_all(string=re.compile(r"var\s+viipg\s*=\s*{"))
if not scripts:
    raise RuntimeError("could not find the `var viipg = {` script in the page")
extract = re.search(r".*cfg\s*:\s*({.*}),", scripts[0])
if extract is None:
    raise RuntimeError("could not find the `cfg` object inside the viipg script")
jsObject = extract.group(1)


def dict_str(data):
    """Rewrite a JS object literal so that `json.loads` accepts it.

    Wraps unquoted keys in double quotes, then turns every single quote
    into a double quote.
    """
    quoted_keys = re.sub(r"([{,]\s*)([^\"':,]+)(\s*:)", r'\1"\2"\3', data)
    return quoted_keys.replace("'", '"')


config = json.loads(dict_str(jsObject))
print(config)

# Fetch the obfuscated page.map; its URL is built from the `slug` and
# `video` fields of the extracted config.
r = requests.get(f'http://videolectures.net/{config["slug"]}/video/{config["video"]}/page.map', timeout=30)
r.raise_for_status()
# Decoder alphabet: a character's index in this string is its numeric value.
charList = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789~-_.,;:<>!?*+=\"'#$%&/\\()\n\t "
charLen = len(charList)
def updateSlug(a):
    """Convert a key string into the numeric shift used by `parse`.

    Sums the index of every character of `a` within `charList`. A character
    that is not in `charList` contributes -1 (the `str.find` miss value).
    A negative total is clamped to 0.

    :param a: key text (the script passes the lecture slug)
    :return: non-negative integer shift
    """
    total = sum(charList.find(ch) for ch in a)
    return max(total, 0)
def parse(data, slug):
    """Decode an obfuscated string with a position-dependent shift cipher.

    Every character of `data` found in `charList` is replaced by the
    character `updateSlug(slug) + 6 * index` places earlier in `charList`,
    wrapping around. Characters not in `charList` are copied unchanged but
    still occupy an index.

    :param data: obfuscated text
    :param slug: key text, converted to a numeric shift with `updateSlug`
    :return: decoded text, same length as `data`
    """
    shift = updateSlug(slug)
    decoded = []
    for idx, ch in enumerate(data):
        rank = charList.find(ch)
        if rank == -1:
            decoded.append(ch)
        else:
            # Python's % already yields a value in 0..charLen-1 for a
            # positive modulus, so a single reduction is enough.
            decoded.append(charList[(rank - shift - 6 * idx) % charLen])
    return "".join(decoded)
# Decode the fetched payload (everything after its first character) using the
# lecture slug as key, then parse the decoded text as XML.
xmlConfig = parse(r.text[1:], config["slug"])
soup = BeautifulSoup(xmlConfig, features="xml")
# One (proto, src) pair per <video> element, in document order.
allVideos = list(map(lambda video: (video["proto"], video["src"]), soup.find_all("video")))
print(allVideos)
# Keep the entries whose proto is "http" and take the first of them.
httpVideo = list(filter(lambda pair: pair[0] == "http", allVideos))[0]
print(httpVideo)
Try this on repl.it
输出:
{'slug': 'icml2015_liang_language_understanding', 'type': 'Lecture', 'obj_id': 23694, 'video': 1, 'video_id': 23648, 'videos': [1, 2], 'chrome_colors': ['FFFFFF', 'F21F1F'], 'livepipe': '//videolectures.net', 'site_slug': 'vln', 'media_url': 'https://static.videolectures.net/r.1483388978/', 'sentry': '//161ef9a5c3a14af1848399909b890522@sentry.viidea.com/6'}
[('rtmp', 'mp4:v012/01/ae6se7y42if65i27glt3yfanffv5rvtk.mp4'), ('m3u8', 'http://hydro2.videolectures.net/vod/_definst_/mp4:v012/01/ae6se7y42if65i27glt3yfanffv5rvtk.mp4/playlist.m3u8'), ('secure_download', 'http://videolectures.net/site/secure_dl/14f30db64e8184a24b2bb4414dcfdc5d/5f8127f8/ae6se7y42if65i27glt3yfanffv5rvtk/tag=1001866/icml2015_liang_language_understanding_01_400x300_h264.mp4'), ('http', 'http://hydro.ijs.si/v012/01/ae6se7y42if65i27glt3yfanffv5rvtk.mp4')]
('http', 'http://hydro.ijs.si/v012/01/ae6se7y42if65i27glt3yfanffv5rvtk.mp4')