如何结合 boost::spirit::lex 和 boost::spirit::qi？答案

【问题标题】：How to combine boost::spirit::lex & boost::spirit::qi?如何结合 boost::spirit::lex 和 boost::spirit::qi？
【发布时间】：2012-11-15 10:21:23
【问题描述】：

我有一个词法分析器，现在想基于它创建一个语法，使用该词法分析器生成的标记。我尝试改写了找到的一些示例，现在得到了可以编译并运行的代码，但我的一个本应失败的测试却没有失败。现在我想知道原因，也想弄清楚我在那里实际做了什么（我想真正理解——我只是从一些示例中复制了代码，但这并没有真正加深理解）。

词法分析器：

#include <boost/spirit/include/lex_lexertl.hpp>

namespace lex = boost::spirit::lex;

// Token ids handed to the lexer through self.add(...) and matched in the
// grammar with qi::token(id).
// The ids deliberately start at a non-zero value: 0 is the default-constructed
// id that Spirit.Lex interprets as "no id supplied" (and the underlying lexertl
// engine reserves 0 for end of input), so an explicit id of 0 would not end up
// on the token. 1000 stays clear of single-character ids and of the ids
// Spirit.Lex assigns automatically (lex::min_token_id and above).
enum LexerIDs {
    ID_IDENTIFIER = 1000,
    ID_WHITESPACE,
    ID_INTEGER,
    ID_FLOAT,
    ID_PUNCTUATOR
};

// Lexer definition: declares the token patterns and registers them with the
// lexer. Regular tokens go into the default state with the ids from LexerIDs;
// the whitespace pattern is placed in a separate lexer state named "WS".
//
// Lexer is the underlying lexer implementation this definition derives from
// (via lex::lexer<Lexer>).
//
// NOTE(review): none of the patterns matches a lone "0" (integer_value needs a
// leading 1-9, hex_value needs the "0x" prefix, both float patterns need a
// '.') -- confirm whether that is intended.
template <typename Lexer>
struct custom_lexer : lex::lexer<Lexer>
{
    custom_lexer()
        : identifier("[a-zA-Z_][a-zA-Z0-9_]*")
        , white_space("[ \\t\\n]+")
        , integer_value("[1-9][0-9]*")
        , hex_value("0[xX][0-9a-fA-F]+")
        , float_value("[0-9]*\\.[0-9]+([eE][+-]?[0-9]+)?")
        , float_value2("[0-9]+\\.([eE][+-]?[0-9]+)?")
        , punctuator("\\[|\\]|\\(|\\)|\\.|&>|\\*\\*|\\*|\\+|-|~|!|\\/|%|<<|>>|<|>|<=|>=|==|!=|\\^|&|\\||\\^\\^|&&|\\|\\||\\?|:|,")// [ ] ( ) . &> ** * + - ~ ! / % << >> < > <= >= == != ^ & | ^^ && || ? : ,
    {
        // Tokens of the default state, each tagged with its LexerIDs value.
        // Decimal and hex integers share ID_INTEGER; both float spellings
        // share ID_FLOAT.
        this->self.add
            (identifier, ID_IDENTIFIER)
            /*(white_space, ID_WHITESPACE)*/
            (integer_value, ID_INTEGER)
            (hex_value, ID_INTEGER)
            (float_value, ID_FLOAT)
            (float_value2, ID_FLOAT)
            (punctuator, ID_PUNCTUATOR);

        // Whitespace is not added to the default state; it is the only
        // pattern of the "WS" state.
        this->self("WS") = white_space;
    }

    lex::token_def<std::string> identifier;    // names: letter or '_' followed by letters, digits, '_'
    lex::token_def<lex::omit> white_space;     // blanks, tabs and newlines; declared without an attribute
    lex::token_def<int> integer_value;         // decimal literal without a leading zero
    // NOTE(review): declared with an int attribute -- verify how the "0x..."
    // text is converted when the attribute is actually requested.
    lex::token_def<int> hex_value;             // "0x"/"0X" followed by hex digits
    lex::token_def<double> float_value;        // optional integer part, '.', fraction, optional exponent
    lex::token_def<double> float_value2;       // integer part, '.', no fraction, optional exponent
    lex::token_def<> punctuator;               // operators and punctuation listed in the constructor
};

语法：

namespace qi  = boost::spirit::qi;
namespace lex = boost::spirit::lex;

// Grammar that runs on the token stream produced by the lexer.
// Iterator is the token iterator type; Lexer is the type used to form the
// qi::in_state_skipper<Lexer> skipper of the grammar and of its rule.
template< typename Iterator, typename Lexer>
struct custom_grammar : qi::grammar<Iterator, qi::in_state_skipper<Lexer>>
{

    // tok: the token definitions object; it is not referenced in the body,
    // the rule below selects tokens by their LexerIDs value instead.
    template< typename TokenDef >
    custom_grammar(const TokenDef& tok) : custom_grammar::base_type(ges)
    {
        // Start rule: a single token whose id is ID_INTEGER or ID_FLOAT.
        ges = qi::token(ID_INTEGER) | qi::token(ID_FLOAT);
        BOOST_SPIRIT_DEBUG_NODE(ges);
        // Requests debug output for the rule (the <ges> trace shown in the question).
        debug(ges);
    }
    // Start rule of the grammar (handed to base_type in the constructor).
    qi::rule<Iterator, qi::in_state_skipper<Lexer>> ges;
};

还有例子：

// Tokenizes "1234 56" with custom_lexer and parses the token stream with
// custom_grammar, using the lexer's "WS" state as the skipper.
BOOST_AUTO_TEST_CASE(BasicGrammar)
{
    namespace lex = boost::spirit::lex;
    namespace qi = boost::spirit::qi;

    std::string test("1234 56");

    // Token type over a raw character range; lex::omit is given as the
    // attribute parameter here.
    typedef lex::lexertl::token<char const*, lex::omit, boost::mpl::true_> token_type;
    typedef lex::lexertl::lexer<token_type> lexer_type;

    typedef custom_lexer<lexer_type>::iterator_type iterator_type;

    custom_lexer<lexer_type> my_lexer; 
    custom_grammar<iterator_type, custom_lexer<lexer_type>::lexer_def> my_grammar(my_lexer);

    // Character range [first, last) covering the whole test string.
    char const* first = test.c_str();
    char const* last = &first[test.size()];

    // Token iterators over that character range.
    lexer_type::iterator_type iter = my_lexer.begin(first, last);
    lexer_type::iterator_type end = my_lexer.end();

    // Parse the tokens; tokens of the "WS" lexer state act as the skipper.
    bool r = qi::phrase_parse(iter,end,my_grammar, qi::in_state( "WS" )[ my_lexer.self ]);

    // NOTE(review): r only reports whether the grammar matched; it does not
    // say that the whole input was consumed. Compare iter with end (or append
    // qi::eoi to the grammar) to detect leftover tokens such as "56".
    BOOST_CHECK(r);
}

我的假设是：之所以返回 true，是因为 qi::in_state("WS") 跳过了空格。是这样吗？此外，我知道如何为空格输出额外的标记——但那样的话，我不知道在现在 qi::in_state 所在的位置应该放什么——没有它就无法工作。

关于代码结构，有什么我可以改进的地方吗？另外，为什么调试输出这么奇怪？

<ges>
  <try>[]</try>
  <success></success>
  <attributes>[]</attributes>
</ges>

感谢您的帮助。

问候

托比亚斯

【问题讨论】：

parse 函数的返回值并不取决于整个输入是否已被消耗，只取决于它能否解析出您要求它解析的内容。在这种情况下，您要求的是一个整数或浮点数；由于成功解析了一个整数，因此它返回 true。要检查整个输入是否都已被解析，您可以检查 iter 和 end 是否相等，或者使用 qi::eoi，正如 sehe 在此处所解释的那样。调试输出之所以是那样，原因在于您对 token_type 的定义：应把其中的 lex::omit 换成 boost::mpl::vector<...>。
谢谢。 qi::eoi 'trick' 很棒。如何定义令牌类型才能在输出中看到有意义的内容？
typedef lex::lexertl::token<char const*, boost::mpl::vector<int,float,std::string>, boost::mpl::true_> token_type;。如果 qi::eoi 这个技巧对您有帮助，您或许应该给 sehe 的答案点赞。
哦啊哈。刷新页面有帮助:) 我添加了一个答案。它还演示了如何使用lex::tokenize_and* 系列 API。 @llonesmiz 关于 token_type 的观点是正确的，我忘了提。
我可能会 - 一旦我阅读并尝试了它。他昨天帮我解决了一个问题——所以我毫不怀疑。

标签： c++ boost boost-spirit boost-spirit-qi boost-spirit-lex

【解决方案1】：

您的解析器没有失败，但它也没有“默默地”跳过空格（无论如何，它只解析一个 非空格 标记）。

事实上，*phrase_parse 系列 Spirit API 的一个特性就是它不一定匹配完整的输入。这也正是它按引用接收第一个迭代器的原因：解析结束后，该迭代器会指示解析停止的位置。

我做了一些改动，改为在词法分析器上使用 lex::tokenize_and_phrase_parse 而不是 qi::phrase_parse，这样您就可以方便地访问源迭代器：

// Character range [first, last) covering the whole input string.
Iterator first = test.c_str();
Iterator last = &first[test.size()];

// Tokenize and parse in one call, working directly on the character
// iterators; tokens of the "WS" lexer state act as the skipper.
bool r = lex::tokenize_and_phrase_parse(first,last,my_lexer,my_grammar,qi::in_state( "WS" )[ my_lexer.self ]);

// Print the parse result, then whatever lies between first and last after the call.
std::cout << std::boolalpha << r << "\n";
std::cout << "Remaining unparsed: '" << std::string(first,last) << "'\n";

输出是：

Remaining unparsed: '56'

这是一个完整的工作示例（注意我还直接将语法类的第二个参数更改为 Skipper，这对于 Spirit 语法更典型）：

#include <iostream>
#include <string>

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>

namespace qi  = boost::spirit::qi;
namespace lex = boost::spirit::lex;

// Token ids handed to the lexer through self.add(...) and matched in the
// grammar with qi::token(id).
// The ids deliberately start at a non-zero value: 0 is the default-constructed
// id that Spirit.Lex interprets as "no id supplied" (and the underlying lexertl
// engine reserves 0 for end of input), so an explicit id of 0 would not end up
// on the token. 1000 stays clear of single-character ids and of the ids
// Spirit.Lex assigns automatically (lex::min_token_id and above).
enum LexerIDs {
    ID_IDENTIFIER = 1000,
    ID_WHITESPACE,
    ID_INTEGER,
    ID_FLOAT,
    ID_PUNCTUATOR
};

// Lexer definition: declares the token patterns and registers them with the
// lexer. Regular tokens go into the default state with the ids from LexerIDs;
// the whitespace pattern is placed in a separate lexer state named "WS".
//
// Lexer is the underlying lexer implementation this definition derives from
// (via lex::lexer<Lexer>).
//
// NOTE(review): none of the patterns matches a lone "0" (integer_value needs a
// leading 1-9, hex_value needs the "0x" prefix, both float patterns need a
// '.') -- confirm whether that is intended.
template <typename Lexer>
struct custom_lexer : lex::lexer<Lexer>
{
    custom_lexer()
        : identifier    ("[a-zA-Z_][a-zA-Z0-9_]*")
        , white_space   ("[ \\t\\n]+")
        , integer_value ("[1-9][0-9]*")
        , hex_value     ("0[xX][0-9a-fA-F]+")
        , float_value   ("[0-9]*\\.[0-9]+([eE][+-]?[0-9]+)?")
        , float_value2  ("[0-9]+\\.([eE][+-]?[0-9]+)?")
        , punctuator    ("\\[|\\]|\\(|\\)|\\.|&>|\\*\\*|\\*|\\+|-|~|!|\\/|%|<<|>>|<|>|<=|>=|==|!=|\\^|&|\\||\\^\\^|&&|\\|\\||\\?|:|,")// [ ] ( ) . &> ** * + - ~ ! / % << >> < > <= >= == != ^ & | ^^ && || ? : ,
    {
        // Tokens of the default state, each tagged with its LexerIDs value.
        // Decimal and hex integers share ID_INTEGER; both float spellings
        // share ID_FLOAT.
        this->self.add
            (identifier   , ID_IDENTIFIER)
          /*(white_space  , ID_WHITESPACE)*/
            (integer_value, ID_INTEGER)
            (hex_value    , ID_INTEGER)
            (float_value  , ID_FLOAT)
            (float_value2 , ID_FLOAT)
            (punctuator   , ID_PUNCTUATOR);

        // Whitespace is not added to the default state; it is the only
        // pattern of the "WS" state.
        this->self("WS") = white_space;
    }

    lex::token_def<std::string> identifier;     // names: letter or '_' followed by letters, digits, '_'
    lex::token_def<lex::omit>   white_space;    // blanks, tabs and newlines; declared without an attribute
    lex::token_def<int>         integer_value;  // decimal literal without a leading zero
    // NOTE(review): declared with an int attribute -- verify how the "0x..."
    // text is converted when the attribute is actually requested.
    lex::token_def<int>         hex_value;      // "0x"/"0X" followed by hex digits
    lex::token_def<double>      float_value;    // optional integer part, '.', fraction, optional exponent
    lex::token_def<double>      float_value2;   // integer part, '.', no fraction, optional exponent
    lex::token_def<>            punctuator;     // operators and punctuation listed in the constructor
};

// Grammar that runs on the token stream produced by the lexer.
// Iterator is the token iterator type; Skipper is the skipper type shared by
// the grammar and its start rule.
template< typename Iterator, typename Skipper>
struct custom_grammar : qi::grammar<Iterator, Skipper>
{

    // The token definitions object is accepted for interface compatibility but
    // not used: the rule selects tokens by their LexerIDs value. The parameter
    // name is commented out so that no unused-parameter warning is emitted.
    template< typename TokenDef >
    custom_grammar(const TokenDef& /*tok*/) : custom_grammar::base_type(ges)
    {
        // Start rule: a single token whose id is ID_INTEGER or ID_FLOAT.
        ges = qi::token(ID_INTEGER) | qi::token(ID_FLOAT);
        BOOST_SPIRIT_DEBUG_NODE(ges);
    }
    // Start rule of the grammar (handed to base_type in the constructor).
    qi::rule<Iterator, Skipper > ges;
};

int main()
{
    std::string test("1234 56");

    typedef char const* Iterator;
    typedef lex::lexertl::token<Iterator, lex::omit, boost::mpl::true_> token_type;
    typedef lex::lexertl::lexer<token_type> lexer_type;
    typedef qi::in_state_skipper<custom_lexer<lexer_type>::lexer_def> skipper_type;

    typedef custom_lexer<lexer_type>::iterator_type iterator_type;

    custom_lexer<lexer_type> my_lexer; 
    custom_grammar<iterator_type, skipper_type> my_grammar(my_lexer);

    Iterator first = test.c_str();
    Iterator last = &first[test.size()];

    bool r = lex::tokenize_and_phrase_parse(first,last,my_lexer,my_grammar,qi::in_state( "WS" )[ my_lexer.self ]);

    std::cout << std::boolalpha << r << "\n";
    std::cout << "Remaining unparsed: '" << std::string(first,last) << "'\n";
}

【讨论】：

必须这样使用 white_space 标记吗？我的意思是先给它命名，然后再通过名称引用？您在其他地方也建议过类似的做法（stackoverflow.com/questions/13361519/…）。我曾尝试直接通过成员变量使用它，但没有成功。另外，必须使用 qi::in_state 吗？文档说我们应该提供一个解析器作为 skipper（跳过器）。那为什么不能直接用标记本身呢？毕竟它在语法中就是作为解析器使用的……
@AdamBadura 你能分享一下你有什么吗，也许再发个问题？
给你：stackoverflow.com/questions/39468278/…
如果可以的话，还可以再来一个：stackoverflow.com/questions/39468928/…