C中的连续字符串替换答案

【问题标题】：Consecutive string replacements in C（C 中的连续字符串替换）
【发布时间】：2021-03-13 18:53:27
【问题描述】：

请看下面的代码，它通过循环遍历所有要替换的utf8字符来执行连续的字符/字符串替换；你会提出另一个更高效的构造吗？

/*
 * UTF-8 encoded glyphs that convert() looks for in its input.
 * utf8[i] is replaced by ebcdic[i], so both tables must keep the same
 * length and the same order.  The list is terminated by a NULL entry.
 * The pointers are const-qualified because they refer to string literals,
 * which must never be written through.
 */
static const char *const utf8[66] = {
  "◊",    "⎕",    "⍞",    "⌹",    "⊤",    "⊥",
  "⌶",    "⌈",    "∪",    "⍕",    "⍎",    "│",
  "⍟",    "∆",    "∇",    "→",    "←",    "⌊",
  "┐",    "└",    "─",    "↑",    "↓",    "≡",
  "⍸",    "⋸",    "∵",    "⌷",    "⍂",    "⌻",
  "⊣",    "⊢",    "⋄",    "┘",    "┌",    "⍺",
  "⊂",    "⊃",    "⍝",    "⍲",    "⍴",    "⍱",
  "⌽",    "⊖",    "○",    "∨",    "⍳",    "⍬",
  "∈",    "∩",    "⌿",    "⍀",    "≥",    "≤",
  "≠",    "×",    "÷",    "⍙",    "∘",    "⍵",
  "⍫",    "⍋",    "⍒",    "¯",    "¨",    NULL
};

/*
 * Single-byte replacement strings, one per entry of utf8[]:
 * convert() substitutes ebcdic[i] for every occurrence of utf8[i].
 * Both tables must keep the same length and order.  The list is
 * terminated by a NULL entry.
 * The pointers are const-qualified because they refer to string literals,
 * which must never be written through.
 */
static const char *const ebcdic[66] = {
  "\x8d", "\x90", "\x91", "\x92", "\x98", "\x9d",
  "\x9f", "\xa9", "\xac", "\xae", "\xaf", "\xb3",
  "\xb5", "\xb6", "\xb7", "\xb8", "\xbd", "\xbe",
  "\xbf", "\xc0", "\xc4", "\xc6", "\xc7", "\xcf",
  "\xd0", "\xd1", "\xd2", "\xd3", "\xd4", "\xd5",
  "\xd6", "\xd7", "\xd8", "\xd9", "\xda", "\xe0",
  "\xe2", "\xe3", "\xe4", "\xe5", "\xe6", "\xe7",
  "\xe8", "\xe9", "\xea", "\xeb", "\xec", "\xed",
  "\xee", "\xef", "\xf0", "\xf1", "\xf2", "\xf3",
  "\xf4", "\xf5", "\xf6", "\xf7", "\xf8", "\xf9",
  "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", NULL
};

/* replace() is defined further down; declare it before its first use. */
char *replace(const char *s, const char *oldW, const char *newW);

/*
 * Return a newly allocated copy of `line` in which every occurrence of
 * utf8[i] has been replaced by ebcdic[i], followed by a single '\n'.
 *
 * The tables are walked until their NULL terminator.  Each pass hands the
 * previous intermediate string to replace() and frees it afterwards, so at
 * most two intermediate buffers are alive at any time.
 *
 * Returns NULL if `line` is NULL or if any allocation fails.
 * The caller owns the returned string and must free() it.
 */
char* convert(char *line) {
  char *current;
  char *result;
  size_t len;
  size_t i;

  if (line == NULL)
    return NULL;

  /* +1 for the terminating NUL */
  current = malloc(strlen(line) + 1);
  if (current == NULL)
    return NULL;
  strcpy(current, line);

  for (i = 0; utf8[i] != NULL && ebcdic[i] != NULL; i++) {
    char *next = replace(current, utf8[i], ebcdic[i]);

    /* replace() returned a fresh buffer; the old one is no longer needed */
    free(current);
    if (next == NULL)
      return NULL;
    current = next;
  }

  /* Grow by two bytes: one for the '\n', one for the NUL */
  len = strlen(current);
  result = realloc(current, len + 2);
  if (result == NULL) {
    free(current);
    return NULL;
  }
  result[len] = '\n';
  result[len + 1] = '\0';
  return result;
}

/*
 * Return a newly allocated copy of `s` in which every non-overlapping
 * occurrence of `oldW` (scanning left to right) is replaced by `newW`.
 *
 * s     NUL-terminated input string; it is not modified.
 * oldW  substring to search for.  An empty `oldW` matches nothing and the
 *       result is a plain copy of `s`.
 * newW  replacement text; may be empty, which deletes the matches.
 *
 * Returns NULL if any argument is NULL or if the allocation fails.
 * The caller owns the returned string and must free() it.
 */
char* replace(const char* s, const char* oldW, const char* newW) {
  size_t old_len;
  size_t new_len;
  size_t s_len;
  size_t count = 0;
  const char *p;
  const char *hit;
  char *result;
  char *out;

  if (s == NULL || oldW == NULL || newW == NULL)
    return NULL;

  old_len = strlen(oldW);
  new_len = strlen(newW);

  /* An empty pattern would match at every position and never advance. */
  if (old_len == 0) {
    s_len = strlen(s);
    result = malloc(s_len + 1);
    if (result != NULL)
      memcpy(result, s, s_len + 1);
    return result;
  }

  /* Pass 1: count the matches, jumping from one match to the next. */
  for (p = s; (hit = strstr(p, oldW)) != NULL; p = hit + old_len)
    count++;

  /* p now sits just past the last match; the rest is the unmatched tail. */
  s_len = (size_t)(p - s) + strlen(p);

  /* s_len >= count * old_len, so the subtraction cannot wrap. */
  result = malloc(s_len - count * old_len + count * new_len + 1);
  if (result == NULL)
    return NULL;

  /* Pass 2: copy the text between matches, then the replacement. */
  out = result;
  for (p = s; (hit = strstr(p, oldW)) != NULL; p = hit + old_len) {
    size_t gap = (size_t)(hit - p);

    memcpy(out, p, gap);
    out += gap;
    memcpy(out, newW, new_len);
    out += new_len;
  }
  /* Copy the tail together with its terminating NUL. */
  strcpy(out, p);
  return result;
}

update-001：为 replace() 添加了代码。
update-002：将 for/loop 更改为 while。

感谢您的关注，在这种特殊情况下，我更关心可读性和内存使用情况而不是性能。

【问题讨论】：

(a) 将分配空间的长度增加一以允许空终止符。 (b) 显示replace 的定义。 (c) 不要释放缓冲区并分配新空间，只需重用同一个缓冲区。 (d) 根本不分配缓冲区来复制line；只为新行分配空间。输入可以直接从用户传递的缓冲区中读取，而无需复制。 (e) sprintf(tmp, "%s\n", buffer1); 是荒谬的。只需将所需的数据放入buffer1 （如果需要，为换行符分配更多空间）并返回。 (f) 缓冲区未被释放并被回收。
添加了 replace() 的代码
我建议先写正确的代码，然后再担心效率。 while(ebcdic[i]) 显然是不正确的，它永远不会终止。 malloc(strlen(line)) 显然是不正确的。每次迭代的 mallocs 多于 frees 显然是不正确的。

标签： c loops optimization

【解决方案1】：

我假设您编写这段代码是为了练习；否则不必自己实现，直接使用现有的工具/库即可。

当你想转换字符/代码点时，基本算法是这样的：

从输入字符串中获取下一个代码点，转换该代码点（或保持原样），将转换后的代码点存储在输出字符串的末尾。重复。

由于您的输入字符串中每个代码点只占一个 char，因此获取下一个代码点只需逐个遍历输入字符串中的“字符”即可。这也意味着代码点转换可以用一个大小为 256 的简单查找表来完成（假设 char 为 8 位）。而 utf8 代码点的长度不一定为 1 个字节，因此在追加到输出字符串时必须考虑这一点。

/* Lookup table indexed by the input byte value; each entry is the UTF-8
   string to emit for that byte.
   The designated-initializer syntax below means that the array is
   initialized with
   ebdic2utf8_lut[0x8d] = "◊", ebdic2utf8_lut[0x90] = "⎕", etc.
   Array elements that are not explicitly assigned in the initializer
   list are initialized to `0` (i.e. NULL).
   A NULL entry is treated as "keep the input byte as is". */

static const char *const ebdic2utf8_lut[256] = {
  [0x8d] = "◊",
  [0x90] = "⎕",
  [0x91] = "⍞",
  [0x92] = "⌹",
  /* The remaining initializers are left out for brevity */
};

/*
 * Convert the NUL-terminated byte string `src` to UTF-8 using
 * ebdic2utf8_lut.  A byte whose table entry is NULL is copied unchanged.
 *
 * The output length is measured first so that the result is allocated
 * exactly once instead of being reallocated for every input byte.
 *
 * Returns a newly allocated, NUL-terminated string that the caller must
 * free().  On allocation failure an error is printed with perror() and
 * the process exits with status 1.
 */
char * convert(const char *src)
{
   size_t dst_length = 0;

   /* Pass 1: add up the length of every converted character */
   for (const char *p = src; *p; p++)
     {
        /* We want to lose the sign of `char` for the lut */
        const char *utf8 = ebdic2utf8_lut[(unsigned char)*p];

        dst_length += utf8 ? strlen(utf8) : 1;
     }

   /* Room for the converted text plus zero-termination */
   char *dst = malloc(dst_length + 1);
   if (!dst)
      {
         perror("String allocation failed");
         exit(1);
      }

   /* Pass 2: append each converted character to the destination */
   char *out = dst;
   for (const char *p = src; *p; p++)
     {
        const char *utf8 = ebdic2utf8_lut[(unsigned char)*p];

        if (utf8)
          {
             size_t utf8_length = strlen(utf8);

             memcpy(out, utf8, utf8_length);
             out += utf8_length;
          }
        else
          {
             /* No conversion for this byte: keep it as is */
             *out++ = *p;
          }
     }

   /* Zero terminate */
   *out = '\0';

   return dst;
}

/* Demo: convert a sample string and print the UTF-8 result. */
int main(void)
{
  char *str = convert("Hello\x90\x91\x92World");

  /* This should print "Hello⎕⍞⌹World", unless you are under Windows.
     Windows and utf-8 doesn't mix very well */
  printf("%s\n", str);

  /* convert() hands ownership of the string to the caller */
  free(str);
  return 0;
}

此代码未经检查错误，使用风险自负等。

【讨论】：

几个问题：看起来 dst_length 保持为零？（应该是 size_t 吗？）和 char* dst=malloc(1)？
是的，dst_length 应该是 size_t（现已修复）。它的长度不保持为零，而是在循环中更新。其他修复：src 在循环中递增，而不是dst。（dst 仍被realloc 更新）
如果您使用dst=malloc(1) 而不是dst=calloc(1)，则目标字符串在为空时不会以零结尾。稍后strcpy 将添加零终止，但这取决于源字符串不为空，以及进入循环的代码。如果您更喜欢使用malloc，您可以这样做，但是您应该在malloc 之后添加dst[0] = '\0' 以捕获src 为空时的情况。
Calloc() 不接受 2 个参数吗？
请找到以下要点gist.github.com/jibanes/712a8c5ac5640b1730a1dc7da4217678。当我调用 convert("123⍳123") 时，它会在此处打印 d8 d8 d8 d8 d8 d8 d8 d8 d8。

【解决方案2】：

要从 utf8 转换为 ebdic，仍然可以使用查找表。但是从 utf8->ebdic 获取 LUT 是不切实际的，因为它的大小。但我们可以使用 ebdic->utf8 LUT 并循环查找匹配项。

字符串转换的基本算法大体不变：取出输入字符串中的下一个 utf8 代码点/字符，将其转换为 ebdic，把转换后的字符追加到输出字符串（如果没有对应的转换，则原样追加输入的一个字符），然后从输入字符串中去掉已处理的前缀，如此重复。

在进行 utf8->ebdic 转换时，我们必须注意一个 utf8 码位可能是多个字节。所以我们要比较输入字符串前缀中的多个字节，而且我们还必须将输入字符串增加多个字节。

#include <stdio.h>
#include <string.h>
#include <stdlib.h>


/* Lookup table indexed by the single-byte code; each entry is the UTF-8
   string for that code.  Indices that are not listed are implicitly
   initialized to NULL, which utf8lookup() skips when searching. */
static const char *const ebdic2utf8_lut[256] = {
  [0x8d] = "◊", [0x90] = "⎕", [0x91] = "⍞", [0x92] = "⌹",
  [0x98] = "⊤", [0x9d] = "⊥", [0x9f] = "⌶", [0xa9] = "⌈",
  [0xac] = "∪", [0xae] = "⍕", [0xaf] = "⍎", [0xb3] = "│",
  [0xb5] = "⍟", [0xb6] = "∆", [0xb7] = "∇", [0xb8] = "→",
  [0xbd] = "←", [0xbe] = "⌊", [0xbf] = "┐", [0xc0] = "└",
  [0xc4] = "─", [0xc6] = "↑", [0xc7] = "↓", [0xcf] = "≡",
  [0xd0] = "⍸", [0xd1] = "⋸", [0xd2] = "∵", [0xd3] = "⌷",
  [0xd4] = "⍂", [0xd5] = "⌻", [0xd6] = "⊣", [0xd7] = "⊢",
  [0xd8] = "⋄", [0xd9] = "┘", [0xda] = "┌", [0xe0] = "⍺",
  [0xe2] = "⊂", [0xe3] = "⊃", [0xe4] = "⍝", [0xe5] = "⍲",
  [0xe6] = "⍴", [0xe7] = "⍱", [0xe8] = "⌽", [0xe9] = "⊖",
  [0xea] = "○", [0xeb] = "∨", [0xec] = "⍳", [0xed] = "⍬",
  [0xee] = "∈", [0xef] = "∩", [0xf0] = "⌿", [0xf1] = "⍀",
  [0xf2] = "≥", [0xf3] = "≤", [0xf4] = "≠", [0xf5] = "×",
  [0xf6] = "÷", [0xf7] = "⍙", [0xf8] = "∘", [0xf9] = "⍵",
  [0xfa] = "⍫", [0xfb] = "⍋", [0xfc] = "⍒", [0xfd] = "¯",
  [0xfe] = "¨" };


/* Find the table entry whose UTF-8 string is a prefix of `str`.
   On a match, store the byte length of that UTF-8 string in `*increment`
   and return the entry's index as a char.
   Return 0 when nothing matches; `*increment` is left untouched then. */

char utf8lookup(const char *str, size_t *increment)
{
  size_t code;

  for (code = 0; code != 256; ++code)
    {
      const char *candidate = ebdic2utf8_lut[code];
      size_t candidate_len;

      /* Unassigned table slot: nothing to compare against */
      if (candidate == NULL)
        continue;

      candidate_len = strlen(candidate);
      if (strncmp(str, candidate, candidate_len) != 0)
        continue;

      *increment = candidate_len;
      return (char)code;
    }

  return 0;
}



/*
 * Convert the NUL-terminated UTF-8 string `src` to single-byte codes.
 * Each prefix recognized by utf8lookup() becomes the byte it returns;
 * any other input byte is copied unchanged.
 *
 * Every step consumes at least one input byte and emits exactly one
 * output byte, so the output can never be longer than the input.  The
 * buffer is therefore allocated once at that upper bound and shrunk to
 * the exact size at the end, instead of being reallocated per byte.
 *
 * Returns a newly allocated, NUL-terminated string that the caller must
 * free().  On allocation failure an error is printed with perror() and
 * the process exits with status 1.
 */
char * convert_u2e(const char *src)
{
   /* Upper bound: one output byte per input byte, plus zero-termination */
   char *dst = malloc(strlen(src) + 1);
   if (!dst)
      {
         perror("String allocation failed");
         exit(1);
      }

   size_t dst_length = 0;

   while(*src)
     {
        /* Convert next character */
        size_t src_increment;
        char ch = utf8lookup(src, &src_increment);

        /* If there is no conversion we take the first character in `src`
           as is.  A zero-length match is treated the same way so that the
           loop always advances and the size bound above holds. */
        if (!ch || src_increment == 0)
          {
             ch = *src;
             src_increment = 1;
          }

        /* Append converted character to destination string */
        dst[dst_length++] = ch;
        src += src_increment;
     }

   /* Zero terminate */
   dst[dst_length] = '\0';

   /* Give back the unused tail; if shrinking fails the original block is
      still valid and is returned unchanged. */
   char *shrunk = realloc(dst, dst_length + 1);

   return shrunk ? shrunk : dst;
}

/* Demo: convert a sample UTF-8 string and dump every resulting byte
   in hexadecimal, separated by spaces, followed by a newline. */
int main(void)
{
  char *converted = convert_u2e("Hello⎕⍞⌹World");
  const char *cursor = converted;

  while (*cursor)
    {
      printf("%hhx ", (unsigned char)*cursor);
      cursor++;
    }
  printf("\n");

  /* The converted string is owned by the caller */
  free(converted);
}

【讨论】：