【问题标题】:Needing help with analyzing HTTP form(在分析 HTTP 表单方面需要帮助)
【发布时间】:2011-03-27 14:37:24
【问题描述】:


为了训练自己使用 php 和 HTML 表单,我决定制作一个小型 Web 应用程序,它从另一个网站收集数据,但将其显示在移动设备上。

在本次练习中,我选择了我所在地区的公交公司站点:http://delijn.be/en/index.htm。我分析了网站,发现了一个名为“form1”的表单,它通过POST方法向网站发送数据:http://reisinfo.delijn.be/reisinfo/RouteplannerHomeBeperktServlet?taal=en

我开始编写 php 代码并在互联网上发现您可以使用 cURL 发送 POST 字段。所以我做了。不幸的是,它不起作用。我得到了网站的错误页面。所以我猜有些字段一定是丢失了,但我已经检查了所有内容,但找不到另一个字段。就这样,我再次来到这里,寻求帮助。

Web 应用程序托管在我的家庭服务器上,也可以从那里下载。

如果有人能帮我解决这个问题,我将不胜感激,
ief2


PS:部分代码是荷兰语写的,所以这里有一些翻译:
  • Gemeente = 城镇/城市
  • Plaats = 位置
  • Nummer = 号码(门牌号)
  • Datum = 日期
  • Dag = 日
  • Maand = 月
  • Jaar = 年
  • Uur = 小时
  • Aankomst = 到达
  • Vertrek = 出发
  • Berekenen = 计算


PPS:下载链接貌似失效了,但是我下载没问题,所以这里有一些代码片段:

index.php

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
    <head>
        <title>De Lijn Mobile</title>
        <meta name="viewport" content="width = device-width">
    </head>

    <body>
        <form name="main" action="calculateRoute.php" method="post">
            <b>Vertrek:</b><br>
            Gemeente: <input type="text" name="vertrekGemeente"><br>
            Straat: <input type="text" name="vertrekStraat"><br>
            Nummer: <input type="text" name="vertrekNummer"><br>
            <hr>
            <b>Aankomst:</b><br>
            Gemeente: <input type="text" name="aankomstGemeente"><br>
            Straat: <input type="text" name="aankomstStraat"><br>
            Nummer: <input type="text" name="aankomstNummer"><br>
            <hr>
            <b>Datum:</b><br>
            <?php
                // The date and time inputs below are pre-filled from $now
                // (presumably the current date and time; Date.php is not
                // visible here). require_once matches the other scripts and
                // avoids redeclaring the Date class if this page is ever
                // included from a script that already loaded it.
                require_once("./Date.php");
                $now = new Date();
            ?>
            <input type="radio" name="datumType" value="aankomst" checked> Aankomst<br>
            <input type="radio" name="datumType" value="vertrek"> Vertrek<br>
            Dag: <input type="text" size="2" name="datumDag" value="<?php echo $now->day; ?>"><br>
            Maand: <input type="text" size="2" name="datumMaand" value="<?php echo $now->month; ?>"><br>
            Jaar: <input type="text" size="4" name="datumJaar" value="<?php echo $now->year; ?>"><br>
            Tijdstip: <input type="text" size="2" name="datumUur" value="<?php echo $now->hour; ?>"> : 
            <input type="text" size="2" name="datumMinuten" value="<?php echo $now->minutes; ?>"><br>
            <hr>
            <input type="submit" value="Bereken"><br>
        </form>
    </body>
</html>

calculateRoute.php

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
    <head>
        <title>De Lijn Mobile - Berekeningen</title>
    </head>

    <body>
        <?php
            // Reads the fields posted by index.php, wraps them in Plaats and
            // Date objects and hands them to searchDeLijn().
            require_once("./Plaats.php");
            require_once("./Date.php");
            require_once("./DeLijn.php");

            /**
             * Returns the posted value of $name, or "" when the field was not
             * submitted (e.g. the page was opened directly instead of through
             * the form), so no undefined-index notice is raised.
             */
            function postValue($name) {
                return isset($_POST[$name]) ? $_POST[$name] : "";
            }

            echo "Gathering data...<br>";
            $vertrekPlaats = new Plaats(
                postValue("vertrekGemeente"),
                postValue("vertrekStraat"),
                postValue("vertrekNummer"));

            $aankomstPlaats = new Plaats(
                postValue("aankomstGemeente"),
                postValue("aankomstStraat"),
                postValue("aankomstNummer"));

            $datumType = postValue("datumType");
            $dag = postValue("datumDag");
            $maand = postValue("datumMaand");
            $jaar = postValue("datumJaar");
            $uur = postValue("datumUur");
            $min = postValue("datumMinuten");
            $datum = Date::withDate($jaar, $maand, $dag, $uur, $min);
            // NOTE(review): the month is overwritten with the raw posted value
            // right after construction; the reason is not visible here
            // (Date.php is not shown) -- confirm before removing.
            $datum->month = $maand;

            echo "Searching...<br>";
            searchDeLijn($vertrekPlaats,
                $aankomstPlaats,
                $datumType,
                $datum);

        ?>
    </body>
</html>

DeLijn.php

<?php

require_once("Route.php");
require_once("Date.php");
require_once("Plaats.php");

// Values of the "datumType" radio button posted by index.php.
define('DATE_ARRIVAL', "aankomst");
define('DATE_DEPARTURE', "vertrek");

/**
 * Posts a route query to the De Lijn route planner and parses the reply.
 *
 * @param object $dep      departure place; ->gemeente, ->straat, ->nummer are read
 * @param object $ar       arrival place; same properties as $dep
 * @param string $dateType DATE_DEPARTURE to search by departure time; any
 *                         other value searches by arrival time
 * @param object $date     ->day, ->month, ->year, ->hour, ->minutes are read
 * @return mixed whatever extractRoutesFromXMLData() returns for the reply,
 *               or null when the HTTP request itself fails
 */
function searchDeLijn($dep, $ar, $dateType, $date) {
    $vertrekkenOfAankomen = "aankomen";
    if(DATE_DEPARTURE === $dateType) {
        $vertrekkenOfAankomen = "vertrekken";
    }

    // Round the minutes down to a multiple of five.
    $myMins = (int)$date->minutes;
    $myMins -= ($myMins % 5);

    $postFields = array(
        "form1:vertrekGemeenteInput" => $dep->gemeente,
        "form1:vertrekStraatInput" => $dep->straat,
        "form1:vertrekNrInput" => $dep->nummer,

        "form1:aankomstGemeenteInput" => $ar->gemeente,
        "form1:aankomstStraatInput" => $ar->straat,
        "form1:aankomstNrInput" => $ar->nummer,

        "form1:vertrekkenOfAankomenRadio" => $vertrekkenOfAankomen,
        "form1:dagCombo" => (string)(int)$date->day,
        "form1:maandCombo" => (string)(int)$date->month,
        "form1:jaarCombo" => $date->year,
        "form1:uurCombo" => (string)(int)$date->hour,
        "form1:minutenCombo" => (string)$myMins);

    print_r($postFields);

    // do the curl
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL,
        'http://reisinfo.delijn.be/reisinfo/RouteplannerHomeBeperktServlet?taal=nl');
    curl_setopt($ch, CURLOPT_POST, 1);
    // Passing an array makes cURL send multipart/form-data. An ordinary HTML
    // form posts application/x-www-form-urlencoded, which is what a
    // url-encoded string produces here.
    curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($postFields, '', '&'));
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): the site may require a session cookie. If the reply is a
    // "session expired" page, enable cURL's cookie engine (CURLOPT_COOKIEFILE
    // / CURLOPT_COOKIEJAR) and fetch the form page first -- unverified.

    $contents = curl_exec($ch);
    curl_close($ch);
    // Strict comparison: only a failed transfer is an error, an empty body
    // is still a reply.
    if($contents === false) {
        return null;
    }

    echo $contents;

    $myRouteObjects = extractRoutesFromXMLData($contents);
    return $myRouteObjects;
}

/**
 * Collects the rows of the route-planner results table found in $dataString.
 *
 * @param string $dataString HTML of the reply page
 * @return array|null one entry per table row, in document order (the row
 *                    nodes themselves, not Route objects), or null when the
 *                    results table or its rows cannot be located
 */
function extractRoutesFromXMLData($dataString) {
    $tableBody = getResultsTableBody($dataString);
    // Give up only when the lookup FAILED; the original test was inverted
    // and returned null on success.
    if($tableBody === null) { return null; }

    $tableRows = getTableRowsOfTableBody($tableBody);
    if($tableRows === null) { return null; }

    // put them in an array (each row once -- the original always read item 0)
    $myArray = array();
    foreach($tableRows as $aNode) {
        $myArray[] = $aNode;
    }

    return $myArray;
}

/**
 * Extracts the <TBODY> of the "routeplanner_overzicht" table and parses it.
 *
 * @param string $dataString HTML of the reply page
 * @return DOMDocument|null document whose root element is the table body, or
 *                          null when the table or its body is missing or the
 *                          fragment is not well-formed XML
 */
function getResultsTableBody($dataString) {
    // Get table element (the original searched an undefined variable)
    $status = preg_match('/<TABLE id="routeplanner_overzicht".*?>.*?<\/TABLE>/is',
            $dataString, $matches);
    if($status !== 1) {
        return null;
    }

    $tableElement = $matches[0];

    // Extract body
    $status = preg_match('/<TBODY>.*?<\/TBODY>/is',
            $tableElement, $matches);
    if($status !== 1) {
        return null;
    }

    // Collect parser errors internally rather than emitting warnings, and
    // report a fragment that does not parse as "not found".
    $previous = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $loaded = $doc->loadXML($matches[0]);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if($loaded === false) {
        return null;
    }

    return $doc;
}


/**
 * Selects the row elements of the first table body in $xmlDoc.
 *
 * @param DOMDocument $xmlDoc document containing a tbody element
 * @return DOMNodeList|null the tr children of the first tbody (possibly an
 *                          empty list), or null when the XPath query fails
 */
function getTableRowsOfTableBody($xmlDoc) {
    // The original built the XPath object from an undefined variable.
    $xpath = new DOMXPath($xmlDoc);

    // Element names keep the case used by the page, so compare them
    // lower-cased. XPath positions start at 1 -- "[0]" never matches.
    $lower = "translate(local-name(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
        . "'abcdefghijklmnopqrstuvwxyz')";
    $xpathres = $xpath->evaluate(
        "(//*[" . $lower . " = 'tbody'])[1]/*[" . $lower . " = 'tr']");
    if($xpathres === false) {
        return null;
    }

    return $xpathres;
}
?>

Date.php、Plaats.php 和 Route.php 分别包含封装日期、位置和可能路线的类。

【问题讨论】:

  • document.forms[1].elements.length 说有 14 个,但您只列出了 11 个。此外,有些网站拒绝在没有 cookie 的情况下运行(一个跟踪和两个 javascript 测试)。查看 Firebug 网络流量。
  • 我确实漏掉了 13 个字段中的一个(一个单选按钮),但您看的是 Route.php,那是结果解析器,它处理的是一个包含 11 列的表格。不过我检查了 cookie,我认为您是对的:当我禁用 cookie 时,网站会显示“Uw sessie is verlopen”(=“您的会话已过期”)。那么可以手工构造这样的 cookie 吗?还是有其他方法可以解决这个问题?
  • 没有查看您的代码。下载链接无效。最好在此处粘贴摘录。 -- cURL 允许以某种方式设置 cookie,请参阅各种 CURLOPT_COOKIE* 标志。我认为这是此类任务最常见的问题。

标签: php html post curl


【解决方案1】:

缺少字段,服务器对发布数据的响应确实很奇怪。我只能自动化一页。要点击其他链接,cookie 显然是不够的。

我编写了一些代码,可能对其他需要了解表单布局的人有用:

HTMLFormExtractor.py

#!/usr/bin/python
import sys
import getopt
import urllib
import re

# ############################
# This code may be used by anyone. It may be used in both free
# and commercial software. It may be copied, modified and even
# be sold. The creator of this code takes no responsibility for
# any damage this script could do.
# ############################

# ############################
# ############################
# Usage: ./exec [-x] [URL]
# 
# This application logs all forms of an HTML document and its
# objects which have the HTML 'name'-attribute set. The program
# currently only works when the attributes of the objects are
# styled like the XML format (eg: name="myname").
# 
# Options:
#   -x: Create an XML document of the following form:
#           ==== BEGIN XML ====
#           formlist
#               form (variable)
#                   attribute (variable)
#                       name
#                       value
#
#                   object (variable)
#                       type (eg: input)
#                       name (eg: username)
#           ==== END XML ====
#
#   URL: a URL pointing to an available HTML file. If it's not
#       specified, the program will read the HTML document
#       from the standard input.
#
# ############################

# ===== DATA =====
# Tag names that getFormObjects() looks for inside a form.
FORM_OBJECTS_TAG_NAME = (
    "input",
    "textarea",
    "label",
    "fieldset",
    "legend",
    "select",
    "optgroup",
    "option",
    "button",
)



# ===== CLASSES =====
class HTMLAttribute:
    """One name="value" attribute taken from an HTML tag."""

    def __init__(self, name, value, orString = None):
        """Store the attribute.

        name     -- attribute name
        value    -- attribute value without the surrounding quotes
        orString -- the source text the attribute was parsed from, if any
        """
        self.name = name
        self.value = value
        # The original always stored None here and dropped the argument.
        self.originalString = orString

    @classmethod
    def withAttributeString(cls, string):
        """Build an attribute from a string of the form attrName="value".

        Single or double quotes are accepted. Raises IndexError when the
        string holds no name followed by '=' or no quoted value.
        """
        attrName = re.findall(r"(\w+)=", string)[0]

        # Prefer a value closed by the SAME quote that opened it, so that
        # alt='say "hi"' is not cut short at the first inner quote.
        quoted = re.findall(r"([\"'])(.*?)\1", string)
        if quoted:
            value = quoted[0][1]
        else:
            # No matching pair: fall back to the first span delimited by
            # any two quote characters.
            value = re.findall(r"[\"'](.*?)[\"']", string)[0]

        return cls(attrName, value, string)

class HTMLObject:
    """An HTML tag: its tag name plus the attributes parsed from it."""

    def __init__(self, aName):
        self.name = aName
        self.attributes = [] # contains HTMLAttribute

    def addAttribute(self, anAttribute):
        """Append an HTMLAttribute to this tag."""
        self.attributes.append(anAttribute)

    def getAttributeWithName(self, aName):
        """Return the first attribute named aName (case-insensitive), or None."""
        aName = aName.lower()
        for anAttribute in self.attributes:
            if anAttribute.name.lower() == aName: return anAttribute
        return None

    @classmethod
    def withTagString(cls, string):
        """Build an object from a string of the form <aTagName attrName="value" ... >.

        Only the first tag in the string is parsed. Attributes must be
        quoted (single or double quotes). Raises IndexError when the string
        holds no tag or the tag has no name.
        """
        # Keep only the first <...> tag; it may span several lines.
        tagString = re.findall(r"<.*?>", string, re.S)[0]

        # The name is the word right after '<'. Unlike the original pattern
        # this also accepts self-closing tags such as <br/>.
        tagName = re.findall(r"<(\w+)", tagString)[0]

        # The backreference makes a value end at the same quote character
        # that opened it, so the other quote kind may appear inside it.
        attrRegex = re.compile(r"\w+=([\"']).*?\1")

        myObj = cls(tagName)
        for attrMatch in attrRegex.finditer(tagString):
            attrObj = HTMLAttribute.withAttributeString(attrMatch.group(0))
            myObj.addAttribute(attrObj)

        return myObj

class HTMLForm:
    """Pairs a form name with the objects found inside that form."""

    def __init__(self, name, htmlObjects):
        # htmlObjects is a list of HTMLObject
        self.name, self.HTMLObjects = name, htmlObjects

# ===== FUNCTIONS =====
def getFormsFromHTML(htmlData):
    """Return the text of every <form ...>...</form> element in htmlData.

    Matching ignores case and may span several lines.
    """
    formPattern = re.compile("<form.*?>.*?</form>", re.IGNORECASE | re.S)
    return formPattern.findall(htmlData)

def getFormObjects(aForm):
    """Return a list of HTMLObject, one per form-control tag found in aForm.

    A tag counts when it starts with one of the names in
    FORM_OBJECTS_TAG_NAME (case-insensitive).
    """
    if not FORM_OBJECTS_TAG_NAME:
        return []

    # One alternation over all tag names: <(?:input|textarea|...).*?>
    tagPattern = re.compile(
        "<(?:" + "|".join(FORM_OBJECTS_TAG_NAME) + ").*?>", re.S | re.I)

    return [HTMLObject.withTagString(tagText)
            for tagText in tagPattern.findall(aForm)]

def printForms(foundForms, foundObjects):
    """Print a readable summary of each form to standard output.

    foundForms   -- list of HTMLObject, one per form tag
    foundObjects -- list of lists of HTMLObject; the list at index i holds
                    the objects contained in foundForms[i]

    For every form its attributes are listed, followed by the number of
    objects in that form and each object that has a 'name' attribute.
    """
    # print(...) with a single string argument behaves the same under the
    # Python 2 print statement and the Python 3 print function.
    for index, aForm in enumerate(foundForms):
        print("===== FORM " + str(index + 1) + " =====")

        print("\tATTRIBUTES:")
        for anAttribute in aForm.attributes:
            print("\t\t" + anAttribute.name + ": '" + anAttribute.value + "'")

        # Count the objects of THIS form; the original printed the length
        # of the outer list, i.e. the number of forms.
        formObjects = foundObjects[index]
        print("\n\t" + str(len(formObjects)) + " OBJECTS:")
        for anObject in formObjects:
            nameAttribute = anObject.getAttributeWithName("name")
            if nameAttribute is not None:
                print("\t\t" + anObject.name + " (name=\"" + nameAttribute.value + "\")")

        print("\n")


def createXMLString(foundForms, foundObjects):
    """Return an XML document describing the forms and their named objects.

    foundForms   -- list of HTMLObject, one per form tag
    foundObjects -- list of lists of HTMLObject; the list at index i holds
                    the objects contained in foundForms[i]

    Objects without a 'name' attribute are left out. All text is
    XML-escaped, so a parser reading the result gets each value exactly as
    it appeared in the HTML source.

    XML:
        formlist
            form (mult)
                attribute (mult)
                    name
                    value

                object (mult)
                    type (eg: input)
                    name (eg: username)
    """
    # Local import: this is the only place that needs XML escaping.
    from xml.sax.saxutils import escape

    xmlParts = ["<formlist>\n"]
    for index, aForm in enumerate(foundForms):
        # make form child
        xmlParts.append("\t<form>\n")

        # add all attributes
        for anAttr in aForm.attributes:
            xmlParts.append("\t\t<attribute>\n")
            xmlParts.append("\t\t\t<name>" + escape(anAttr.name) + "</name>\n")
            xmlParts.append("\t\t\t<value>" + escape(anAttr.value) + "</value>\n")
            xmlParts.append("\t\t</attribute>\n")

        # add all input objects if they have a name
        for anObject in foundObjects[index]:
            nameAttr = anObject.getAttributeWithName("name")
            if nameAttr is not None:
                xmlParts.append("\t\t<object>\n")
                xmlParts.append("\t\t\t<type>" + escape(anObject.name) + "</type>\n")
                xmlParts.append("\t\t\t<name>" + escape(nameAttr.value) + "</name>\n")
                xmlParts.append("\t\t</object>\n")

        # end child -- with a real closing tag; the original wrote a second
        # opening <form> here, which made the document malformed
        xmlParts.append("\t</form>\n\n")

    # end xml: drop the last character (a newline) before the closing tag
    xmlString = "".join(xmlParts)
    return xmlString[:-1] + "</formlist>\n"


# ===== MAIN =====
# Parse the command line options
userArgv = sys.argv[1:]
flags, arguments = getopt.getopt(userArgv, "x")
wantsXMLFormat = ('-x', '') in flags
hasURL = len(arguments) > 0

# Get the HTML data
myHTML = None
if hasURL:
    myURL = arguments[0]
    # urlopen signals failure by raising, never by returning None, so the
    # error message has to be printed from an exception handler.
    try:
        urlHandle = urllib.urlopen(myURL)
    except IOError:
        print("Failed to open the URL")
        sys.exit(1)
    try:
        myHTML = urlHandle.read()
    finally:
        # close the handle even when reading fails
        urlHandle.close()

else:
    myHTML = sys.stdin.read()

# Get all forms
htmlForms = getFormsFromHTML(myHTML)

# Loop with all forms
foundForms = []
foundObjects = [] # list of list
for aFormTag in htmlForms:
    # append the form (parsed from its opening tag)
    formHTMLObject = HTMLObject.withTagString(aFormTag)
    foundForms.append(formHTMLObject)

    # append the form's input objects (scanned once; the original ran the
    # same scan twice and discarded the first result)
    allObjects = getFormObjects(aFormTag)
    foundObjects.append(allObjects)


# Print or create xml
if not wantsXMLFormat:
    printForms(foundForms, foundObjects)
else:
    myXMLString = createXMLString(foundForms, foundObjects)
    print(myXMLString)

【讨论】:

    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2011-03-04
    • 2017-03-02
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多