【发布时间】:2017-10-02 00:16:52
【问题描述】:
我正在使用Simple HTML DOM 来为个人项目抓取 EPG 数据。
目前,代码会抓取每个频道的数据,并将其转储到 json 文件中,我通过添加我自己的 $Channels 来过滤所有抓取的数据,这会将抓取的数据限制为仅我特别请求的项目以及添加我自己的使用以下流链接...
$channels = array(
"ITV1 London" => "URL 1",
);
我想不出一种方法来避免每个通道的数据在输出的 json 文件中重复。因为我需要请求$channels,所以我可以过滤最终输出中显示的数据以及将我自己的链接添加到最终输出中。
if ($channels[$channel_name]) {
$channel = array();
完整代码
<?php
// Include the php dom parser
include_once 'simple_html_dom.php';
header('Content-type: application/json');
// Create DOM from URL or file
$curl = curl_init();
curl_setopt($curl, CURLOPT_HEADER, 0);
curl_setopt($curl, CURLOPT_RETURNTRANSFER,1);
curl_setopt($curl, CURLOPT_URL, "http://tv24.co.uk");
$html=curl_exec($curl);
$dom = new simple_html_dom(null, true, true);
$html=$dom->load($html, true, true);
$channels = array(
"ITV1 London" => "URL 1"
);
$data = array();
foreach($html->find('section div') as $ul)
{
foreach($ul->find('div.channel-wrapper') as $show) {
$channel_name = $show->find('h2.name')[0]->plaintext;
if ($channels[$channel_name]) {
$channel = array();
$channel['channel'] =$channel_name ;
$channel['logo'] = $show->find('span.logo img')[0]->src;
$channel['thumb'] = explode("'", $show->find('div.program')[0]->style)[1];
$channel['on-now'] = $show->find('span.title a')[0]->plaintext;
$channel['on-now-time'] = $show->find('span.time')[0]->plaintext;
$channel['on-now-description'] = $show->find('span.description')[0]->plaintext;
$channel['up-next'] = $show->find('span.title a')[1]->plaintext;
$channel['up-next-time'] = $show->find('span.time')[1]->plaintext;
$channel['stream'] = $channels[$channel_name];
$data['data'][] = $channel;
}
}
}
echo json_encode($data);
$myFile = "output.json";
$fh = fopen($myFile, 'w') or die("error");
$stringData = json_encode($data);
fwrite($fh, $stringData);
fclose($fh);
?>
【问题讨论】:
标签: php json web-scraping simple-html-dom