【问题标题】:AWS Textract - Analyzing PDF file with Lambda(AWS Textract - 使用 Lambda 分析 PDF 文件)
【发布时间】:2022-01-06 07:36:44
【问题描述】:

我在 Lambda 中用 JavaScript 调用 Textract 分析 PDF 文档时遇到了困难。非常感谢大家能提供一些帮助。

这是我的代码:

// AWS SDK for JavaScript, loaded from the `aws-sdk` package.
const AWS = require("aws-sdk");
// Apply the region from the AWS_REGION environment variable to the global SDK config.
AWS.config.update({ region: process.env.AWS_REGION });

// Textract service client, created once at module load and used by the handler below.
const textract = new AWS.Textract();

exports.handler = async (event, context) => {
  const bucket = event.Records[0].s3.bucket.name;
  const key = decodeURIComponent(
    event.Records[0].s3.object.key.replace(/\+/g, " ")
  );
  var textractParams = {
    DocumentLocation: {
      S3Object: {
        Bucket: bucket,
        Name: key,
      },
    },
    FeatureTypes: ["FORMS"]
  };
  
  try {
      const textractAnalysis = await textract.startDocumentAnalysis(textractParams);
      
      var analysisParams = {
        JobId: textractAnalysis.JobId
      };
      
      const data = await textract.getDocumentAnalysis(analysisParams);
      console.log(data);
  } catch (e) {
    console.log(e);
  }
};

这是我得到的回复:

  domain: null,
  service: Service {
    config: Config {
      credentials: [EnvironmentCredentials],
      credentialProvider: [CredentialProviderChain],
      region: 'eu-west-1',
      logger: null,
      apiVersions: {},
      apiVersion: null,
      endpoint: 'textract.eu-west-1.amazonaws.com',
      httpOptions: [Object],
      maxRetries: undefined,
      maxRedirects: 10,
      paramValidation: true,
      sslEnabled: true,
      s3ForcePathStyle: false,
      s3BucketEndpoint: false,
      s3DisableBodySigning: true,
      s3UsEast1RegionalEndpoint: 'legacy',
      s3UseArnRegion: undefined,
      computeChecksums: true,
      convertResponseTypes: true,
      correctClockSkew: false,
      customUserAgent: null,
      dynamoDbCrc32: true,
      systemClockOffset: 0,
      signatureVersion: 'v4',
      signatureCache: true,
      retryDelayOptions: {},
      useAccelerateEndpoint: false,
      clientSideMonitoring: false,
      endpointDiscoveryEnabled: undefined,
      endpointCacheSize: 1000,
      hostPrefixEnabled: true,
      stsRegionalEndpoints: 'legacy'
    },
    isGlobalEndpoint: false,
    endpoint: Endpoint {
      protocol: 'https:',
      host: 'textract.eu-west-1.amazonaws.com',
      port: 443,
      hostname: 'textract.eu-west-1.amazonaws.com',
      pathname: '/',
      path: '/',
      href: 'https://textract.eu-west-1.amazonaws.com/'
    },
    _events: { apiCallAttempt: [Array], apiCall: [Array] },
    MONITOR_EVENTS_BUBBLE: [Function: EVENTS_BUBBLE],
    CALL_EVENTS_BUBBLE: [Function: CALL_EVENTS_BUBBLE],
    _clientId: 3
  },
  operation: 'getDocumentAnalysis',
  params: { JobId: undefined },
  httpRequest: HttpRequest {
    method: 'POST',
    path: '/',
    headers: {
      'User-Agent': 'aws-sdk-nodejs/2.1001.0 linux/v14.18.1 exec-env/AWS_Lambda_nodejs14.x'
    },
    body: '',
    endpoint: {
      protocol: 'https:',
      host: 'textract.eu-west-1.amazonaws.com',
      port: 443,
      hostname: 'textract.eu-west-1.amazonaws.com',
      pathname: '/',
      path: '/',
      href: 'https://textract.eu-west-1.amazonaws.com/',
      constructor: [Function]
    },
    region: 'eu-west-1',
    _userAgent: 'aws-sdk-nodejs/2.1001.0 linux/v14.18.1 exec-env/AWS_Lambda_nodejs14.x'
  },
  startTime: 2022-01-05T22:37:00.269Z,
  response: Response {
    request: [Circular *1],
    data: null,
    error: null,
    retryCount: 0,
    redirectCount: 0,
    httpResponse: HttpResponse {
      statusCode: undefined,
      headers: {},
      body: undefined,
      streaming: false,
      stream: null
    },
    maxRetries: 3,
    maxRedirects: 10
  },
  _asm: AcceptorStateMachine {
    currentState: 'validate',
    states: {
      validate: [Object],
      build: [Object],
      afterBuild: [Object],
      sign: [Object],
      retry: [Object],
      afterRetry: [Object],
      send: [Object],
      validateResponse: [Object],
      extractError: [Object],
      extractData: [Object],
      restart: [Object],
      success: [Object],
      error: [Object],
      complete: [Object]
    }
  },
  _haltHandlersOnError: false,
  _events: {
    validate: [
      [Function (anonymous)],
      [Function],
      [Function: VALIDATE_REGION],
      [Function: BUILD_IDEMPOTENCY_TOKENS],
      [Function: VALIDATE_PARAMETERS]
    ],
    afterBuild: [
      [Function: COMPUTE_CHECKSUM],
      [Function],
      [Function: SET_CONTENT_LENGTH],
      [Function: SET_HTTP_HOST]
    ],
    restart: [ [Function: RESTART] ],
    sign: [ [Function (anonymous)], [Function], [Function] ],
    validateResponse: [ [Function: VALIDATE_RESPONSE], [Function (anonymous)] ],
    send: [ [Function] ],
    httpHeaders: [ [Function: HTTP_HEADERS] ],
    httpData: [ [Function: HTTP_DATA] ],
    httpDone: [ [Function: HTTP_DONE] ],
    retry: [
      [Function: FINALIZE_ERROR],
      [Function: INVALIDATE_CREDENTIALS],
      [Function: EXPIRED_SIGNATURE],
      [Function: CLOCK_SKEWED],
      [Function: REDIRECT],
      [Function: RETRY_CHECK],
      [Function: API_CALL_ATTEMPT_RETRY]
    ],
    afterRetry: [ [Function] ],
    build: [ [Function: buildRequest] ],
    extractData: [ [Function: extractData], [Function: extractRequestId] ],
    extractError: [ [Function: extractError], [Function: extractRequestId] ],
    httpError: [ [Function: ENOTFOUND_ERROR] ],
    success: [ [Function: API_CALL_ATTEMPT] ],
    complete: [ [Function: API_CALL] ]
  },
  emit: [Function: emit],
  API_CALL_ATTEMPT: [Function: API_CALL_ATTEMPT],
  API_CALL_ATTEMPT_RETRY: [Function: API_CALL_ATTEMPT_RETRY],
  API_CALL: [Function: API_CALL]
}

但是当我修改这两个 Textract 调用,像下面这样加上 .promise() 时:

// `.promise()` turns the AWS.Request returned by each aws-sdk v2 call into a
// Promise, so `await` yields the service response instead of the request object.
const textractAnalysis = await textract.startDocumentAnalysis(textractParams).promise();
const data = await textract.getDocumentAnalysis(analysisParams).promise();

然后我得到这个响应:

{ JobStatus: 'IN_PROGRESS', AnalyzeDocumentModelVersion: '1.0' }

我也尝试过使用 Textract Client,但情况更糟,因为我根本无法导入它。

提前致谢!

【问题讨论】:

  • 大概在你调用 getDocumentAnalysis 的时候,异步分析还没有完成。响应中的 JobStatus 是什么?
  • @jarmod 我更新了帖子,现在能得到一些响应了,但仍然没有拿到有用的结果。
  • 不确定您还希望在这里看到什么。这是一个异步任务。它正在进行中。您必须等到它完成,此时文本提取结果将提供给您。如果您不想轮询,可以订阅提供的 SNS 主题以获得通知。
  • @jarmod 在创建由 SNS 触发的单独 lambda 后,我设法获得了响应块。谢谢!

标签: javascript node.js amazon-web-services aws-lambda amazon-textract


【解决方案1】:

.startDocumentAnalysis 的文档显示您只获得 jobId 作为响应。

文本分析完成后,Amazon Textract 会将完成状态发布到您在 NotificationChannel 中指定的 Amazon Simple Notification Service (Amazon SNS) 主题。

要获得文本分析操作的结果,首先检查发布到 Amazon SNS 主题的状态值是否为 SUCCEEDED。如果是,则调用 GetDocumentAnalysis,并传入最初调用 StartDocumentAnalysis 时返回的作业标识符 (JobId)。有关详细信息,请参阅“文档文本分析”。

如果您想要一种或多或少同步的方式,您可以使用.analyzeDocument

【讨论】:

  • 同步请求不是 Lambda 函数的好选择。
猜你喜欢
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2021-07-09
  • 1970-01-01
  • 2022-10-14
  • 1970-01-01
  • 1970-01-01
  • 2022-01-09
相关资源
最近更新 更多