通过 JavaScript 检测文档中的希伯来语单词答案

【问题标题】：Detecting Hebrew words in document via JavaScript通过 JavaScript 检测文档中的希伯来语单词
【发布时间】：2010-02-10 08:38:42
【问题描述】：

在 Web 开发方面（虽然不是一般的编程），我是一个新手，所以请原谅任何不正确的术语。

我想构建一个脚本，当添加到 HTML 页面时，它会检测页面中的每个希伯来语单词，并将该单词转换为 HTML 元素，例如转换为带有标题（title）的超链接。

所以，如下：

<p>ראש הלשכה</p>

转化为：

<p><a title="word 1" href="#">הלשכה</a> <a title="word 2" href="#">ראש</a></p>

有意义吗？

所以，我想首要任务是检测页面中的希伯来语单词。我该怎么做呢？除了浏览 jQuery 文档之外，我不知道从哪里开始。

【问题讨论】：

HTML 会是 Unicode 吗？ UTF-8，还是可以是任何编码？
好问题。让我们让它变得简单，说 UTF-8。我希望它可以处理 haaretz.co.il 之类的文档

标签： javascript dom hebrew

【解决方案1】：

在字符串中搜索希伯来语单词相当简单。使用匹配连续的希伯来代码点序列的正则表达式：

// Matches one run of consecutive characters in the range U+05D0 to U+05FF.
/[\u05D0-\u05FF]+/

由于 JS 支持函数式编程，我们可以轻松编写自己的函数来遍历文档树，在每个文本节点上调用一个函数。首先，一点脚手架。

/* Minimal assertion support, installed only when the page has not already
   provided its own window.assert. */
if (! window.assert) {
    window.dbgLvl = 1; // change this to 0 for production release
    /* assert: throws when 'succeeded' is falsy and window.dbgLvl is truthy.

    Arguments:
      succeeded (any): The condition being asserted.
      msg (string, optional): Description of the failure; defaults to
          'assertion failed'.
    */
    window.assert = function (succeeded, msg) {
        // Read the flag through window so a deleted flag simply disables the
        // check instead of raising a ReferenceError.
        if (window.dbgLvl && !succeeded) {
            // Throw an Error rather than a bare string so that the failure
            // carries a stack trace.
            throw new Error(msg || 'assertion failed');
        }
    };
}

接下来，我们定义一个将字符串拆分为数组的方法，包括输出中的分隔符。

/* String.separate is like String.split, but the result includes the
   separators, e.g. 'a-b'.separate('-') gives ['a', '-', 'b'].

   Arguments:
     separator (RegExp || string): What to split on. A string is compiled
         into a global RegExp. When String.split includes captured groups,
         a RegExp is handed to String.split unchanged, so it must already
         have the form /(...)/g.

   Returns an array that alternates text between separators and the
   separators themselves.

   These implementations of 'String.separate' will work for our purposes,
   but are buggy in general, due to differences in the implementation of
   String.split.

   The two misbehaviors we correct are including neither grouped patterns
   nor empty strings in the result, though the latter is only corrected
   when the missing empty string is at the start or the end.
*/

/* Weaves the text between separators ('posts') together with the matched
   separators ('fence'): post, fence, post, ..., fence, post. */
function interleavePostsAndFence(posts, fence) {
    var result = [], i;
    for (i = 0; i < fence.length; ++i) {
        result.push(posts[i]);
        result.push(fence[i]);
    }
    result.push(posts[i]);
    return result;
}

/* separate() for engines whose String.split includes captured groups in
   its result: wrapping the separator in a group is all that is needed. */
function separateWithGroupingSplit(separator) {
    if (typeof separator == 'string') {
        if (separator.charAt(0) != '('
            || separator.charAt(separator.length-1) != ')')
        {
            separator = new RegExp('(' + separator + ')', 'g');
        } else {
            separator = new RegExp(separator, 'g');
        }
    }
    return this.split(separator);
}

/* separate() for engines whose String.split keeps empty strings but drops
   captured groups: the separators are recovered with String.match. */
function separateWithMatch(separator) {
    if (typeof separator == 'string') {
        separator = new RegExp(separator, 'g');
    }
    var fence = this.match(separator);
    if (!fence) {
        // String(this) yields a primitive, like the elements split returns.
        return [String(this)];
    }
    var posts = this.split(separator);
    // Compare, don't assign: writing to posts.length would silently
    // truncate or pad the array and the check could never fail.
    assert(posts.length == fence.length+1,
           'expected one more post than fence pieces');
    return interleavePostsAndFence(posts, fence);
}

/* separate() for engines whose String.split drops both empty strings and
   captured groups: missing empty posts are put back at the ends. */
function separateWithMatchPadded(separator) {
    if (typeof separator == 'string') {
        separator = new RegExp(separator, 'g');
    }
    var fence = this.match(separator);
    if (!fence) {
        return [String(this)];
    }
    var posts = this.split(separator);
    if (posts.length <= fence.length) {
        /* missing some posts. Assume that they are the first or
           last, though this won't be true in general.
        */
        if (posts.length < fence.length) {
            posts.unshift('');
            posts.push('');
        } else {
            if (this.substring(0, fence[0].length) == fence[0]) {
                posts.unshift('');
            } else {
                posts.push('');
            }
        }
    }
    return interleavePostsAndFence(posts, fence);
}

// Feature-detect how String.split behaves and install the matching version.
if ('-'.split(/(-)/).length & 1) {
    assert('a'.split(/a/).length, 'split includes grouping but not empty strings');
    // split includes groups in result
    String.prototype.separate = separateWithGroupingSplit;
} else if ('a'.split(/a/).length) {
    // empty strings included, grouped aren't
    String.prototype.separate = separateWithMatch;
} else {
    // neither empty strings nor groups are included. IE, you suck.
    String.prototype.separate = separateWithMatchPadded;
}

接下来，一些节点谓词。

// Make sure Node.TEXT_NODE is available: create a stand-in Node object when
// there is none, or add the constant when Node exists without it.
if (window.Node) {
    if (typeof Node.TEXT_NODE == 'undefined') {
        Node.TEXT_NODE = 3;
    }
} else {
    window.Node={TEXT_NODE: 3};
}

/* isTextNode: true when the node's nodeType equals Node.TEXT_NODE. */
function isTextNode(node) {
    var kind = node.nodeType;
    return kind == Node.TEXT_NODE;
}
/* hasKids: tells whether a node has children. Returns node.childNodes
   itself when that is falsy, otherwise the number of child nodes. */
function hasKids(node) {
    var kids = node.childNodes;
    if (!kids) {
        return kids;
    }
    return kids.length;
}
/* allNodes: predicate that accepts every node; the argument is ignored. */
function allNodes(node) {
    return true;
}

现在是遍历 DOM 的函数。

/*
  forEachChild: pre-order traversal of document tree. Applies a function to some nodes, determined by the 'which' and 'descendInto' arguments.

Arguments:
  which  (function): Returns true if 'action' should be applied to a node.
  action (function): Takes a node and does something to it. It may replace
      or remove the node it is given; the traversal still continues with the
      sibling that followed the node before 'action' ran.
  parent (Node): The node to start from. Its descendants are visited; the
      node itself is not.
  descendInto (function, optional): By default, forEachChild will descend into every child that itself has children. Place additional restrictions by passing this argument.
*/
var forEachChild = (function() {
        /* the actual implementation is made a local function so that the
           optional parameter can be handled efficiently.
         */
        function visitChildren(which, action, node, descendInto) {
            var child = node.firstChild, next;
            while (child) {
                /* Look up the next sibling before running 'action': once
                   'action' has replaced or detached 'child', its
                   nextSibling is null and the remaining siblings would
                   never be visited.
                 */
                next = child.nextSibling;
                if (which(child)) {
                    action(child);
                }
                if (hasKids(child) && descendInto(child)) {
                    visitChildren(which, action, child, descendInto);
                }
                child = next;
            }
        }
        return function (which, action, node, descendInto) {
            if (!descendInto) {descendInto=allNodes}
            visitChildren(which, action, node, descendInto);
        }
    })();

/* forEachNode: runs forEachChild over the whole document.
   Arguments are passed through to forEachChild; returns whatever it returns. */
function forEachNode(which, action, descendInto) {
    var root = document;
    return forEachChild(which, action, root, descendInto);
}

/* forEachTextNode: forEachNode restricted to the nodes accepted by
   isTextNode. Returns whatever forEachNode returns. */
function forEachTextNode(action, descendInto) {
    var onlyText = isTextNode;
    return forEachNode(onlyText, action, descendInto);
}

/* forEachTextNodeInBody: like forEachTextNode, but the traversal starts at
   document.body instead of the document. Returns whatever forEachChild
   returns. */
function forEachTextNodeInBody(action, descendInto) {
    var root = document.body;
    return forEachChild(isTextNode, action, root, descendInto);
}

最后一组函数用您选择的新节点替换与模式匹配的文本节点中的文本。该组（嗯，wrapText 返回的函数）尚未完全测试跨浏览器兼容性，包括它是否正确处理文本方向。

/*
   wrapText replaces substrings in a text node with new nodes.

 Arguments:
   pattern (RegExp || string): If a RegExp, must be of the form: '/(...)/g'.
   replace (function): Takes a string and returns a Node to replace the string.

Returns a function that takes a text node. That function splits the node's
text with String.separate and, if the pattern matched, swaps the node for
the unmatched text (as text nodes) interleaved with the nodes made by
'replace'. A node whose text does not match is left untouched.
*/
function wrapText(pattern, replace) {
    return function (node) {
        var chunks = node.nodeValue.separate(pattern);
        if (chunks.length < 2)
            return;
        /* separate() yields text, match, text, ..., match, text: matches
         * sit at the odd indexes and the list has odd length. Check this
         * before building anything.
         */
        assert(chunks.length % 2 == 1, 'even number of chunks in ['+chunks+'] when it should be odd.');
        var fragment = document.createDocumentFragment();
        var i;
        for (i=0; i < chunks.length; ++i) {
            if (i % 2) {
                // a match: let the caller build its replacement node
                fragment.appendChild(replace(chunks[i]));
            } else if (chunks[i].length) {
                // text between matches; empty pieces (at either end or
                // between adjacent matches) get no text node at all
                fragment.appendChild(document.createTextNode(chunks[i]));
            }
        }
        node.parentNode.replaceChild(fragment, node);
    }
}

/*
  createAnchorWrap builds a function that turns a string into an anchor
  element whose text is that string and whose title comes from 'title'.

Arguments:
  title (string || function, optional): Source of the anchor's title.
      A function is called with the string being wrapped. A string is
      handed to createWordCounter and the resulting function is used.
      When omitted (or otherwise falsy), createWordCounter() is used.

Returns a function that takes a string and returns an anchor element.
 */
function createAnchorWrap(title) {
    var titleFor;
    if (typeof title == 'string') {
        titleFor = createWordCounter(title);
    } else {
        titleFor = title ? title : createWordCounter();
    }
    return function (word) {
        var anchor = document.createElement('a');
        anchor.title = titleFor(word);
        var label = document.createTextNode(word);
        anchor.appendChild(label);
        return anchor;
    };
}

/*
  createWordCounter creates a word counter, which returns the number of
  times it's been called (including the current call), prefixed by a string.
  Every counter created keeps its own count.

Arguments:
  pre (string, optional): prefix for return value. Trailing spaces are
      replaced by exactly one space. Defaults to 'word '.

Returns a function that takes a string (ignored) and returns a string,
e.g. 'word 1' on the first call and 'word 2' on the second.

 */
function createWordCounter(pre) {
    var wordCount=0;
    var prefix = pre ? pre.replace(/ *$/, ' ') : 'word ';
    return function(text) {
        // Count this call before reporting, so numbering starts at 1.
        wordCount += 1;
        return prefix + wordCount;
    }
}

您要做的最后一件事是启动整个处理过程，例如放在 load 事件处理程序中，或者放在页面底部的脚本中。

// Start the process: wrap every run of characters in U+05D0 to U+05FF found
// in the body's text nodes in an anchor with a default word-counter title.
(function () {
    var hebrewWord = /([\u05D0-\u05FF]+)/g;
    var toAnchor = createAnchorWrap();
    forEachTextNodeInBody(wrapText(hebrewWord, toAnchor));
})();

如果要更改标题的前缀，请将createWordCounter(...) 的结果传递给createAnchorWrap。

【讨论】：

好的，这是一个开始。因此，Javascript 内置了对 RegEx 的支持。好的，太好了。现在，关于在 HTML 文档中查找文本的内容...
好的，现在您已经编写了一些 Javascript 函数来遍历树。看起来我可以使用 forEachTextNode(action) 以某种方式将文本元素替换为 anchor 元素。好吧。我会看看我能做什么。感谢您迄今为止的帮助。
请注意，使用 JS 库（jQuery、Prototype、MooTools ...）可能仍然是个好主意。
嗯。我尝试了以下方法： "הלשכה".match(new RegExp("/[\u05D0-\u05FF]+/")) 并且返回错误，不匹配。我是不是做错了什么？
JS 支持 RE 字面量。试试"ראש הלשכה".match(/[\u05D0-\u05FF]+/g)。在需要使用 RegExp 构造函数的情况下（基本上，当您需要插入变量时），不要添加分隔符：new RegExp('[' + start + '-' + end + ']')。