【问题标题】:Parse custom HTML list tags in C# or Java(在 C# 或 Java 中解析自定义 HTML 列表标签)
【发布时间】:2015-02-14 21:16:48
【问题描述】:

我有一些这样的文字:

This is a simple line
[olist]
    [#]This is line 1
    [#]This is line 2
        [olist]
            [#]This is line 2.1
            [#]This is line 2.2
            [#]This is line 2.3
    and it continues here
        [/olist]
    [#]This is line 3
[/olist]
Another line

如何在 C# 中将其解析为 HTML,如下所示

This is a simple line
<ol>
    <li>This is line 1</li>
    <li>This is line 2
        <ol>
            <li>This is line 2.1</li>
            <li>This is line 2.2</li>
            <li>This is line 2.3
    and it continues here</li>
        </ol>
    </li>
    <li>This is line 3</li>
</ol>
Another line

我目前正在拆分和连接,但未正确处理子列表。

更新:- 示例代码

这就是我目前正在做的事情。

// Rewrites the "[olist]" sections of customHtml as <ol> markup; customHtml is the raw input text (its declaration is not shown here).
var html = ReplaceList(customHtml,"olist","ol");

/// <summary>
/// Repeatedly replaces the first <c>[key]...[/key]</c> section of <paramref name="text"/>
/// with a <c>&lt;tag&gt;</c> element whose children are the "[#]"-separated entries
/// wrapped in <c>&lt;li&gt;</c>, until <c>GetListEntry</c> finds no further section.
/// </summary>
/// <param name="text">Input containing the bracket markup.</param>
/// <param name="key">Name of the custom list tag, e.g. "olist".</param>
/// <param name="tag">Name of the HTML element to emit, e.g. "ol".</param>
/// <returns>The text after all replacements.</returns>
/// <remarks>
/// Sections are matched from the first opening marker to the first closing marker,
/// so nested lists are not paired up correctly - this is the problem described in the question.
/// SubstringBefore/SubstringAfter are extension methods defined elsewhere; presumably they
/// return the part of the string before/after the first occurrence of the argument.
/// </remarks>
private static string ReplaceList(string text, string key, string tag)
{
    string openMarker = "[" + key + "]";
    string closeMarker = "[/" + key + "]";
    var itemSeparator = new[] { "[#]" };

    for (var body = GetListEntry(text, key); body != null; body = GetListEntry(text, key))
    {
        var items = new StringBuilder();
        foreach (var piece in body.Split(itemSeparator, StringSplitOptions.RemoveEmptyEntries))
        {
            // Whitespace-only fragments (e.g. the indentation before the first "[#]") are not items.
            if (string.IsNullOrWhiteSpace(piece))
            {
                continue;
            }

            items.Append("<li>").Append(piece.Trim()).Append("</li>");
        }

        var head = text.SubstringBefore(openMarker);
        var tail = text.SubstringAfter(closeMarker);
        text = head + string.Format("<{0}>{1}</{0}>", tag, items) + tail;
    }

    return text;
}

/// <summary>
/// Returns the text between the first <c>[key]</c> marker and the first
/// <c>[/key]</c> marker found at or after it.
/// </summary>
/// <param name="text">Input to search.</param>
/// <param name="key">Name of the custom tag, without brackets.</param>
/// <returns>
/// The text between the two markers (possibly empty), or null when either marker is missing.
/// </returns>
private static string GetListEntry(string text, string key)
{
    string openTag = "[" + key + "]";
    string closeTag = "[/" + key + "]";

    int openAt = text.IndexOf(openTag, StringComparison.Ordinal);
    if (openAt < 0)
    {
        return null;
    }

    // The closing marker is searched from the opening marker onwards, so the first
    // closing marker wins even when another list is nested inside.
    int closeAt = text.IndexOf(closeTag, openAt, StringComparison.Ordinal);
    if (closeAt <= openAt)
    {
        return null;
    }

    int bodyStart = openAt + openTag.Length;
    return text.Substring(bodyStart, closeAt - bodyStart);
}

注意:一些列表项会跨越多行,还可能包含换行符。

【问题讨论】:

  • 到目前为止你做了什么?你能提供你的代码示例吗?
  • 您似乎正在使用原始字符串工具生成解析器。如果您的用例比您在此处显示的最小部分更复杂,我建议您考虑编写一个实际的解析器。

标签: c# regex custom-tags


【解决方案1】:

您必须先将其解析为某个抽象树,然后从抽象树中组合结果。 即:

/// <summary>
/// A node of the list tree that the parser builds from the bracket markup.
/// </summary>
public interface IElement
{
  /// <summary>Attaches <paramref name="element"/> as a child of this node.</summary>
  void AddElement(IElement element);
  /// <summary>The node this one is nested in; the parser passes null for the root list.</summary>
  IElement Parent { get; }
}

/// <summary>
/// An ordered list node; its children are the list items it contains.
/// </summary>
class OlElement : IElement
{
  /// <summary>The items of this list, in insertion order.</summary>
  public IList<LiElement> Elements { get; set; }

  /// <summary>The node that contains this list, as passed to the constructor.</summary>
  public IElement Parent { get; set; }

  /// <summary>Creates an empty list nested in <paramref name="parent"/>.</summary>
  public OlElement(IElement parent)
  {
    Elements = new List<LiElement>();
    Parent = parent;
  }

  /// <summary>
  /// Adds a list item. The argument is cast to <see cref="LiElement"/>, so any
  /// other element type raises an InvalidCastException.
  /// </summary>
  public void AddElement(IElement element)
  {
    var item = (LiElement)element;
    Elements.Add(item);
  }

  /// <summary>
  /// Renders the list as an "&lt;ol&gt;" line, one appended line per item, and a closing "&lt;/ol&gt;" line.
  /// </summary>
  public override string ToString()
  {
    var html = new StringBuilder().AppendLine("<ol>");
    foreach (LiElement item in Elements)
    {
      html.AppendLine(item.ToString());
    }
    return html.AppendLine("</ol>").ToString();
  }
}

/// <summary>
/// A list item node: its own text plus any lists nested inside it.
/// </summary>
class LiElement : IElement
{
  /// <summary>The text written right after the opening "&lt;li&gt;".</summary>
  public string Text { get; set; }

  /// <summary>The node that contains this item, as passed to the constructor.</summary>
  public IElement Parent { get; set; }

  /// <summary>The lists nested in this item, in insertion order.</summary>
  public IList<OlElement> Elements { get; set; }

  /// <summary>Creates an item with the given text and no nested lists.</summary>
  public LiElement(IElement parent, string text)
  {
    Elements = new List<OlElement>();
    Text = text;
    Parent = parent;
  }

  /// <summary>
  /// Adds a nested list. The argument is cast to <see cref="OlElement"/>, so any
  /// other element type raises an InvalidCastException.
  /// </summary>
  public void AddElement(IElement element)
  {
    var nestedList = (OlElement)element;
    Elements.Add(nestedList);
  }

  /// <summary>
  /// Renders "&lt;li&gt;" followed by the text, each nested list on its own appended line,
  /// and a closing "&lt;/li&gt;" line.
  /// </summary>
  public override string ToString()
  {
    var html = new StringBuilder("<li>").Append(Text);
    foreach (OlElement nestedList in Elements)
    {
      html.AppendLine(nestedList.ToString());
    }
    return html.AppendLine("</li>").ToString();
  }
}

得到结果:

// Sample markup: an ordered list whose second item contains a nested list.
const string text = @"[olist]
[#]This is line 1
[#]This is line 2
    [olist]
        [#]This is line 2.1
        [#]This is line 2.2
        [#]This is line 2.3
    [/olist]
[#]This is line 3
[/olist]";
// Matches a line that starts (after optional indentation) with a bracketed tag:
// "tag" captures the tag name, "text" captures the rest of the line.
var regex = new Regex(@"^\s*\[(?<tag>[^\]]+)\](?<text>.*)$");
var root = new OlElement(null);
// The node currently being filled; it becomes null once the root list has been closed.
var currentElement = (IElement)root;
using (var reader = new StringReader(text))
{
  string line;
  while ((line = reader.ReadLine()) != null)
  {
    var match = regex.Match(line);
    // Lines without a leading tag get a null tag and are handled by the default case.
    var tag = match.Success ? match.Groups["tag"].Value : null;
    switch (tag)
    {
      case "#":
        if (currentElement is OlElement)
        {
          // First item of the current list.
          var child = new LiElement(currentElement, match.Groups["text"].Value);
          currentElement.AddElement(child);
          currentElement = child;
          break;
        }
        if (currentElement is LiElement)
        {
          // A new item ends the previous one; it becomes a sibling in the same list.
          var child = new LiElement(currentElement.Parent, match.Groups["text"].Value);
          currentElement.Parent.AddElement(child);
          currentElement = child;
        }
        break;
      case "olist":
        if (currentElement == root)
        {
          // The outermost [olist] is represented by the pre-created root node.
          break;
        }
        if (currentElement is LiElement)
        {
          var child = new OlElement(currentElement);
          currentElement.AddElement(child);
          currentElement = child;
        }
        break;
      case "/olist":
        if (currentElement is LiElement)
        {
          // Closing the list also closes its last item: go from the item, past its
          // list, to the item that owns that list (null when the root list is closed).
          currentElement = currentElement.Parent.Parent;
          break;
        }
        if (currentElement is OlElement)
        {
          currentElement = currentElement.Parent;
        }
        break;
      default:
        {
          // A line with no tag, or with a tag the parser does not know, continues the
          // item that is currently open: keep it, indentation included, as part of
          // that item's text. Lines outside any item are still ignored.
          var openItem = currentElement as LiElement;
          if (openItem != null)
          {
            openItem.Text += Environment.NewLine + line;
          }
          break;
        }
    }
  }
}
var result = root.ToString();

【讨论】:

  • 很好,但是如果某些列表项跨越多行并且还可能包含换行符,它会失败。所以我添加了: if (currentElement is LiElement) { var li = currentElement as LiElement; li.Text += "\r\n" + line; }
【解决方案2】:

考虑以下方法(注意它在确定标签时“又快又脏”)。

非常简单 - 只需逐行阅读您的文本并进行转换(带有一些前瞻和计算子列表的深度级别)。

// Sample markup with three levels of nesting.
string src = @"[olist]
    [#]This is line 1
    [#]This is line 2
        [olist]
            [#]This is line 2.1
                [olist]
                    [#]This is line 2.1.1
                    [#]This is line 2.1.2
                [/olist]
            [#]This is line 2.2
            [#]This is line 2.3
        [/olist]
    [#]This is line 3
[/olist]";


var sb = new StringBuilder();
// Split on both CRLF and LF: the line endings inside a verbatim literal come from the
// source file, which need not match Environment.NewLine on the machine running the code.
var lines = src.Split(new[] { "\r\n", "\n" }, StringSplitOptions.RemoveEmptyEntries);

// Number of <li> elements that are currently kept open by a nested list.
int innerListsCount = 0;
// True while the most recently written <li> is still on its own line, i.e. it has
// neither been closed nor handed over to a nested list yet.
bool itemOpen = false;
// One entry per open list: true when the list was opened inside an item, so that
// closing the list must also emit that item's </li>.
var listOwnsItem = new System.Collections.Generic.Stack<bool>();

foreach (string line in lines)
{
    if (line.EndsWith("[olist]", StringComparison.Ordinal))
    {
        if (itemOpen)
        {
            // The open item continues with a nested list: end its line but leave the
            // <li> unclosed until the nested list is finished.
            sb.AppendLine();
            innerListsCount++;
        }

        listOwnsItem.Push(itemOpen);
        itemOpen = false;
        sb.AppendLine(line.Replace("[olist]", "<ol>"));
    }
    else if (line.EndsWith("[/olist]", StringComparison.Ordinal))
    {
        if (itemOpen)
        {
            // Last item of the list being closed.
            sb.AppendLine("</li>");
            itemOpen = false;
        }

        sb.AppendLine(line.Replace("[/olist]", "</ol>"));

        // Only a list that was opened inside an item closes that item; checking the
        // stack keeps the counter from going negative on top-level or unbalanced lists.
        if (listOwnsItem.Count > 0 && listOwnsItem.Pop())
        {
            for (int j = 0; j <= innerListsCount; j++)
            {
                sb.Append("    ");
            }

            sb.AppendLine("</li>");
            innerListsCount--;
        }
    }
    else if (line.Trim().StartsWith("[#]", StringComparison.Ordinal))
    {
        if (itemOpen)
        {
            // A new item ends the previous one.
            sb.AppendLine("</li>");
        }

        // The closing tag is deferred until the next line shows whether the item
        // continues, gets a nested list, or ends.
        sb.Append(line.Replace("[#]", "<li>"));
        itemOpen = true;
    }
    else if (itemOpen)
    {
        // Continuation of a multi-line item: keep the line inside the open <li>.
        sb.AppendLine();
        sb.Append(line);
    }
    else
    {
        // Text outside any open item is passed through unchanged.
        sb.AppendLine(line);
    }
}

if (itemOpen)
{
    // Input ended without a closing [/olist]; still close the last item.
    sb.AppendLine("</li>");
}

Console.WriteLine(sb.ToString());

输出看起来完全符合您的要求:

<ol>
    <li>This is line 1</li>
    <li>This is line 2
        <ol>
            <li>This is line 2.1
                <ol>
                    <li>This is line 2.1.1</li>
                    <li>This is line 2.1.2</li>
                </ol>
            </li>
            <li>This is line 2.2</li>
            <li>This is line 2.3</li>
        </ol>
        </li>
    <li>This is line 3</li>
</ol>

【讨论】:

  • 某些列表项跨越多行,还可能包含换行符
  • @SoniAli 好吧,如果您提供的条件不完整 - 结果可能远非预期......
猜你喜欢
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2015-01-20
  • 2023-04-09
  • 2014-06-06
  • 2017-05-18
  • 1970-01-01
相关资源
最近更新 更多