您希望解析器生成的只是“单词 id”的序列(我们称它们为原子)。
只有驱动语义动作的函子才需要“了解”映射。
我将在这里稍微简化您的数据结构:
// Integer id assigned to each distinct word.
using AtomId = size_t;
// Non-owning view of a word; the viewed characters must stay alive for as
// long as any map below stores the view.
using Atom = std::string_view; // or boost::string_view
// Two-way lookup between words and their ids.
struct mapping {
std::map<Atom, AtomId> by_word; // word -> id
std::map<AtomId, Atom> by_id; // id -> word
};
关于那个语义动作
您可以阅读有关Anatomy Of Spirit Semantic Actions 的信息。
如果你想使用综合的、本地的、公开的或继承的属性,你需要解码上下文参数。对此最好的处理方法仍然是这个答案:boost spirit semantic action parameters
但是,如果你看过它,你会发现它并不是很方便。相反,我建议留在 Phoenix 域中(在那里,_1、_val、_pass、_r1 和 _a 之类的东西会神奇地具有预期的含义,而不必知道如何在上下文中访问它们)。
在这种情况下,你会希望你的函数是这样的:
// Phoenix-callable functor that interns a matched source range: it returns
// the id already recorded for the word, or registers the word under the next
// free id (the current number of known words) and returns that.
struct convert_f {
    mapping &m_ref; // dictionary updated through this reference
    using Range = boost::iterator_range<It>;

    // text: the iterator range matched by the parser (the attribute of raw[]).
    // Returns the AtomId for that word, inserting it into both maps when new.
    AtomId operator()(Range const& text) const {
        // An empty range has no element to take the address of, so map it to
        // the empty atom instead of dereferencing text.begin().
        Atom const atom = text.empty() ? Atom{} : Atom{&*text.begin(), text.size()};

        // try_emplace performs lookup and insertion in a single pass. The id
        // argument (by_word.size()) is evaluated before the insertion happens,
        // so ids are handed out as 0, 1, 2, ...
        auto const [it, inserted] = m_ref.by_word.try_emplace(atom, m_ref.by_word.size());
        if (inserted)
            m_ref.by_id.emplace(it->second, atom); // keep the reverse map in sync
        return it->second;
    }
};
boost::phoenix::function<convert_f> convert;
您本可以将 Range 设为 std::string,但我提前多考虑了一步:由于您把完整文件读入了 vector,您可以基于原始源迭代器范围使用 string_view,以避免复制任何内容。这也消除了在两个 map 中存储相同 std::string 的令人不安的冗余¹。
¹ 但请参阅新的“奖金”部分
一些不同的问题点
- BUG:如果您希望 +char_ 仅匹配连续字符,请确保将其包装在 lexeme[] 中(这样它就不能静默地跳过空白),或者当然也可以让规则隐式成为 lexeme(参见 Boost spirit skipper issues)。
- BUG:不要使用 +char_,除非你想解析 /anything/(任何字符)。在你的情况下,你想要的是连续的非空白字符,所以至少应写成 +qi::graph。
- BUG:从 std::cin 读取数据时,空白已经被跳过了,因此所有输入又会变成一个大单词。请先使用 std::noskipws,或者使用 std::istreambuf_iterator 而不是 std::istream_iterator。这一点很微妙,我知道。
- 不要暴露你的 skipper(跳过器),除非你想让调用者改变它
我可能忘记了更多步骤,但现在,让我们忘记这些并放一个演示:
演示
Live On Coliru
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <string_view> // or <boost/utility/string_view.hpp>
#include <iostream>
#include <map>
// Integer id assigned to each distinct word.
using AtomId = size_t;
// Non-owning view of a word; the viewed characters must stay alive for as
// long as any map below stores the view.
using Atom = std::string_view; // or boost::string_view
// Attribute produced by the grammar: the ids of the parsed words.
using Atoms = std::vector<AtomId>;
// Two-way lookup between words and their ids.
struct mapping {
std::map<Atom, AtomId> by_word; // word -> id
std::map<AtomId, Atom> by_id; // id -> word
};
namespace qi = boost::spirit::qi;
// Grammar that turns whitespace-separated words into a vector of atom ids,
// recording every new word in the mapping passed to the constructor.
template <typename It>
struct parser : qi::grammar<It, Atoms()> {
    // r is stored by reference inside convert, so it must outlive the parser.
    // explicit: a mapping should never convert to a parser implicitly.
    explicit parser(mapping &r) : parser::base_type(start), convert({r}) {
        using namespace qi;
        // we don't expose the skipper anymore, so we specify it at toplevel
        start = skip(ascii::space)[ *name ];
        // name is declared without a skipper, so +graph matches one contiguous
        // run of characters; raw[] passes the matched iterator range to
        // convert as _1 and the resulting id becomes the rule's attribute.
        name = raw[ +graph ] [_val = convert(_1)];
    }

  private:
    qi::rule<It, Atoms()> start;
    qi::rule<It, AtomId()> name;

    // Phoenix-callable functor that interns a matched source range: it returns
    // the id already recorded for the word, or registers the word under the
    // next free id (the current number of known words) and returns that.
    struct convert_f {
        mapping &m_ref; // dictionary updated through this reference
        using Range = boost::iterator_range<It>;

        // text: the iterator range matched by raw[].
        // Returns the AtomId for that word, inserting it into both maps when new.
        AtomId operator()(Range const& text) const {
            // An empty range has no element to take the address of, so map it
            // to the empty atom instead of dereferencing text.begin().
            Atom const atom = text.empty() ? Atom{} : Atom{&*text.begin(), text.size()};

            // try_emplace performs lookup and insertion in a single pass. The
            // id argument (by_word.size()) is evaluated before the insertion
            // happens, so ids are handed out as 0, 1, 2, ...
            auto const [it, inserted] = m_ref.by_word.try_emplace(atom, m_ref.by_word.size());
            if (inserted)
                m_ref.by_id.emplace(it->second, atom); // keep the reverse map in sync
            return it->second;
        }
    };
    boost::phoenix::function<convert_f> convert;
};
// Reads all of standard input, parses it into atoms and prints every distinct
// word with its id. Returns 1 when the parse reports failure.
int main() {
using It = std::string::const_iterator;
// istreambuf_iterator reads the raw stream buffer, so whitespace in the input is preserved
std::string const input { std::istreambuf_iterator<char>(std::cin), {} };
mapping sMapping;
parser<It> const sParser(sMapping);
// No attribute is passed to parse() here, so only sMapping (filled by the
// grammar's semantic action) is inspected afterwards.
if (qi::parse(input.begin(), input.end(), sParser)) {
std::cout << "Parsed " << sMapping.by_id.size() << " unique atoms\n";
// by_word is a std::map keyed on the word, so atoms are listed in key order
for (auto& [atom, id] : sMapping.by_word) {
std::cout << atom << "(" << id << ")\n";
}
std::cout << "\n";
} else {
std::cout << "Parse failed\n";
return 1;
}
}
输出(以当前帖子的文本 current post text 作为输入):
Parsed 282 unique atoms
!=(153)
"know(34)
"word(19)
##(63)
&m_ref;(135)
(atom,(161)
(it(152)
(let's(21)
(see(230)
(so(220)
(where(111)
**<kbd>[Live(279)
,(78)
//(50)
/anything/(236)
0.(208)
=(46)
About(64)
Action(67)
Actions](http://boost-spirit.com/home/2010/03/03/the-anatomy-of-semantic-actions-in-qi/).(75)
Atom(48)
Atom>(60)
AtomId(45)
AtomId>(57)
BUG:(209)
Coliru]()</kbd>**(281)
DEMO(278)
However,(92)
I(174)
I'd(105)
I'm(37)
If(76)
In(129)
Instead,(104)
OR(225)
Of(73)
On(280)
Only(25)
Phoenix(109)
Points(207)
Problem(206)
Range(136)
Semantic(66)
Some(204)
Spirit(74)
Still(86)
Subtle,(261)
That(65)
There(0)
This(193)
Use(255)
Varied(205)
What(11)
You(68)
[Anatomy(72)
`+char_`(211)
`+qi::graph`(241)
`Range`(171)
`_1`,(114)
`_a`(119)
`_pass`,(116)
`_r1`(117)
`_val`,(115)
`lexeme[]`(219)
`std::cin`(246)
`std::istream_iterator`.(260)
`std::istreambuf_iterator`(258)
`std::noskipws`(256)
`std::string`(200)
`std::string`,(172)
`string_view`(183)
a(40)
about(71)
about"(35)
action(32)
address(127)
again.(254)
ahead,(177)
all(249)
already(247)
also(194)
and(118)
answer:(90)
anything.(192)
at(96)
atom);(164)
atoms).(24)
atom{&*text.begin(),(142)
attribute(9)
attributes,(81)
auto(149)
auto&(144)
avoid(190)
based(184)
be(132)
become(251)
best(87)
big(252)
binding(4)
bit(41)
boost::iterator_range<It>;(137)
boost::phoenix::function<convert_f>(167)
boost::string_view(52)
but(173)
by_id;(61)
by_word;(58)
call(22)
caller(266)
can(69)
cannot(221)
case,(130)
change(267)
chars,(215)
complexity(42)
const(141)
const&(139)
context(84)
context).(128)
contiguous(214)
convenient.(103)
convert;(168)
convert_f(134)
copying(191)
could(169)
course(226)
creepy(196)
data(244)
decode(83)
demo:(277)
domain(110)
don't(232)
drop(276)
else(157)
expect(210)
expose(263)
exposed(8)
file(180)
find(99)
first(257)
for(265)
forget(275)
forgot(269)
from(245)
fuel(29)
full(179)
function(131)
functor(26)
going(38)
have(82)
having(124)
here:(43)
how(126)
https://stackoverflow.com/questions/17072987/boost-spirit-skipper-issues/17073965#17073965).(231)
https://stackoverflow.com/questions/3066701/boost-spirit-semantic-action-parameters/3067881#3067881(91)
iID(158)
iID);(162)
iID;(165)
ids"(20)
if(93)
implicitly(228)
in(108)
inherited(80)
input(250)
inside(201)
instead(259)
intended(121)
into(181)
is(1)
it(150)
it's(100)
it,(97)
it->second;(156)
iterator(188)
just(16)
know(125)
know.(262)
least(240)
left(145)
left.emplace(160)
left.end())(154)
left.find(atom);(151)
left.size();(159)
let's(274)
lexeme(229)
like(113)
like:(133)
little(2)
local,(79)
looked(95)
m_ref.by_id;(148)
m_ref.by_word;(146)
made(170)
magically(120)
make(216)
map(6)
mapping(54)
mappings.(36)
maps.(203)
match(212)
mean(234)
meanings,(122)
more(271)
needs(33)
non-space,(238)
not(101)
now,(273)
of(18)
on(185)
only(213)
operator()(Range(138)
or(51)
parameter.(85)
parse(235)
parser(14)
probably(268)
produce(15)
range,(189)
raw(186)
read(70)
reading(243)
redundancy(197)
removes(195)
return(155)
right(147)
right.emplace(iID,(163)
rule(227)
same(199)
semantic(31)
sequence(17)
sidestep(39)
silently)(224)
since(178)
size_t;(47)
skip(222)
skipper(264)
so(239)
some(270)
source(187)
stay(107)
std::map<Atom,(56)
std::map<AtomId,(59)
std::string_view;(49)
steps,(272)
storing(198)
stretches(237)
struct(53)
suggest(106)
sure(217)
synthesized(77)
text)(140)
text.size()};(143)
that(27)
the(5)
them(23)
things(112)
thinking(176)
this(89)
to(7)
treatment(88)
two(202)
type.(10)
unless(233)
use(3)
using(44)
vector,(182)
very(102)
want(13)
was(175)
when(242)
whitespace,(248)
whitespaces(223)
will(28)
without(123)
word(253)
wrap(218)
you(12)
you'll(98)
you've(94)
your(30)
{(55)
}(166)
};(62)
哦,我忘了实际存储Atoms:
Live On Coliru
// Receives the parsed id sequence, one id per word in input order.
Atoms idlist;
// Passing idlist as the attribute makes parse() store the ids instead of discarding them
if (qi::parse(input.begin(), input.end(), sParser, idlist)) {
std::cout << "Parsed " << sMapping.by_id.size() << " unique atoms\n";
for (AtomId id : idlist) {
// by_id.at(id) maps each id back to the word it was assigned to
std::cout << "'" << sMapping.by_id.at(id) << "' ";
}
std::cout << "\n";
} else {
// ...
输出的开头类似于:
Parsed 282 unique atoms
'There' 'is' 'little' 'use' 'binding' 'the' 'map' 'to' 'the' 'exposed' 'attribute' 'type.' 'What' 'you' 'want' 'the' ...
奖金
- 使用 Boost Bimap,而不是手工维护这两个 map。这能使两者始终保持同步,并且代码缩短了大约 15 行:
Live On Coliru
// A single bidirectional container takes the place of the by_word / by_id pair.
using mapping = boost::bimap<Atom, AtomId>;
// ...
// Returns the id stored for the matched text. The text is offered to the
// left (word -> id) view with the current element count as its id; .first of
// the insert result refers to the entry for that word.
AtomId convert_f::operator()(Range const& text) const {
    Atom const atom{&*text.begin(), text.size()};
    auto const outcome = m_ref.left.insert({atom, m_ref.size()});
    return outcome.first->second;
}
然后在用法中:
// With the bimap, size() counts the distinct atoms directly
std::cout << "Parsed " << sMapping.size() << " unique atoms\n";
for (AtomId id : idlist) {
// the right view looks up by id and yields the word
std::cout << "'" << sMapping.right.at(id) << "' ";
}