【发布时间】:2021-12-11 16:57:30
【问题描述】:
我是 Google Cloud Platform 的新手，我正在尝试创建一个 Feature Store，并用 Google Cloud Storage 中 csv 文件里的值来填充它。目的是在本地的 Python 笔记本中完成这一操作。我基本上遵循了这里的示例代码，并做了相应的修改，因为我使用的是信用卡公共数据集。运行代码时出现的错误如下：
GoogleAPICallError: None Unexpected state: Long-running operation had neither response nor error set.
它发生在从 csv 文件中提取数据的过程中。
这是我正在处理的代码:
# Standard library.
import os
from datetime import datetime
# Google Cloud clients: BigQuery (destination dataset) and Vertex AI.
from google.cloud import bigquery
from google.cloud import aiplatform
# Protobuf message types for the Feature Store service (v1 API surface).
from google.cloud.aiplatform_v1.types import feature as feature_pb2
from google.cloud.aiplatform_v1.types import featurestore as featurestore_pb2
from google.cloud.aiplatform_v1.types import \
featurestore_service as featurestore_service_pb2
from google.cloud.aiplatform_v1.types import entity_type as entity_type_pb2
# FeatureSelector/IdMatcher are imported but not used in the visible script.
from google.cloud.aiplatform_v1.types import FeatureSelector, IdMatcher
# Point the client libraries at a local service-account key file.
# NOTE(review): hard-coded Windows path — placeholder; must exist at runtime.
credential_path = r"C:\Users\...\.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path
## Constants for the project, region and Feature Store resources.
PROJECT_ID = "my-project-ID"
REGION = "us-central1"
API_ENDPOINT = "us-central1-aiplatform.googleapis.com"
INPUT_CSV_FILE = "my-input-file.csv"
FEATURESTORE_ID = "fraud_detection"
## Output dataset: suffix the base name with a run timestamp so every
## execution writes into a fresh BigQuery dataset.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
DESTINATION_DATA_SET = f"fraud_predictions_{TIMESTAMP}"
## Output table. Make sure that the table does NOT already exist;
## the BatchReadFeatureValues API cannot overwrite an existing table.
DESTINATION_TABLE_NAME = "training_data"
DESTINATION_PATTERN = "bq://{project}.{dataset}.{table}"
DESTINATION_TABLE_URI = DESTINATION_PATTERN.format(
    project=PROJECT_ID,
    dataset=DESTINATION_DATA_SET,
    table=DESTINATION_TABLE_NAME,
)
## Create the destination BigQuery dataset for batch feature reads.
client = bigquery.Client(project=PROJECT_ID)
dataset_id = f"{client.project}.{DESTINATION_DATA_SET}"
dataset = bigquery.Dataset(dataset_id)
## Keep the dataset co-located with the Feature Store region.
dataset.location = REGION
dataset = client.create_dataset(dataset)
print(f"Created dataset {client.project}.{dataset.dataset_id}")
## Create client for CRUD operations and data_client for online reads of
## feature values; both target the regional Vertex AI endpoint.
endpoint_options = {"api_endpoint": API_ENDPOINT}
client = aiplatform.gapic.FeaturestoreServiceClient(
    client_options=endpoint_options)
data_client = aiplatform.gapic.FeaturestoreOnlineServingServiceClient(
    client_options=endpoint_options)
## Base path "projects/{project}/locations/{region}" for resource names.
BASE_RESOURCE_PATH = client.common_location_path(PROJECT_ID, REGION)
## Create the featurestore (only needed on the first run).
featurestore_spec = featurestore_pb2.Featurestore(
    online_serving_config=featurestore_pb2.Featurestore.OnlineServingConfig(
        fixed_node_count=1
    ),
)
create_lro = client.create_featurestore(
    featurestore_service_pb2.CreateFeaturestoreRequest(
        parent=BASE_RESOURCE_PATH,
        featurestore_id=FEATURESTORE_ID,
        featurestore=featurestore_spec,
    )
)
## Block on the long-running operation and print its result, then fetch the
## featurestore back as a sanity check that it exists.
print(create_lro.result())
client.get_featurestore(
    name=client.featurestore_path(PROJECT_ID, REGION, FEATURESTORE_ID)
)
## Create credit card entity type (only the first time).
## BUG FIX: create_entity_type returns a long-running operation, but the
## original code never waited on it. The follow-up batch_create_features
## calls can therefore race the entity-type creation; wait on each LRO.
cc_entity_type_lro = client.create_entity_type(
    featurestore_service_pb2.CreateEntityTypeRequest(
        parent=client.featurestore_path(PROJECT_ID, REGION, FEATURESTORE_ID),
        entity_type_id="creditcards",
        entity_type=entity_type_pb2.EntityType(
            description="Credit card entity",
        ),
    )
)
cc_entity_type_lro.result()  # block until the entity type exists
## Create fraud entity type (only the first time).
fraud_entity_type_lro = client.create_entity_type(
    featurestore_service_pb2.CreateEntityTypeRequest(
        parent=client.featurestore_path(PROJECT_ID, REGION, FEATURESTORE_ID),
        entity_type_id="frauds",
        entity_type=entity_type_pb2.EntityType(
            description="Fraud entity",
        ),
    )
)
fraud_entity_type_lro.result()  # block until the entity type exists
## Create features for the credit card entity type (only the first time).
## The dataset has 28 anonymized components (v1..v28) plus the transaction
## amount, all DOUBLE-valued. Build the 29 requests in a comprehension
## instead of 29 hand-written copies (same ids, same order: v1..v28, amount).
CREDITCARD_FEATURE_IDS = [f"v{i}" for i in range(1, 29)] + ["amount"]
client.batch_create_features(
    parent=client.entity_type_path(
        PROJECT_ID, REGION, FEATURESTORE_ID, "creditcards"),
    requests=[
        featurestore_service_pb2.CreateFeatureRequest(
            feature=feature_pb2.Feature(
                value_type=feature_pb2.Feature.ValueType.DOUBLE,
                description="",
            ),
            feature_id=feature_id,
        )
        for feature_id in CREDITCARD_FEATURE_IDS
    ],
).result()  # wait for all features to be created
## Create the single feature ("class", DOUBLE) for the fraud entity type
## (only the first time).
fraud_feature_request = featurestore_service_pb2.CreateFeatureRequest(
    feature=feature_pb2.Feature(
        value_type=feature_pb2.Feature.ValueType.DOUBLE,
        description="",
    ),
    feature_id="class",
)
client.batch_create_features(
    parent=client.entity_type_path(PROJECT_ID, REGION, FEATURESTORE_ID, "frauds"),
    requests=[fraud_feature_request],
).result()  # wait for feature creation to complete
## Import feature values for credit cards from the GCS CSV file.
## Each FeatureSpec id must match both a previously created Feature and a
## CSV column header. Build the 29 specs programmatically instead of 29
## hand-written copies (same ids, same order: v1..v28, amount).
cc_feature_ids = [f"v{i}" for i in range(1, 29)] + ["amount"]
import_cc_request = aiplatform.gapic.ImportFeatureValuesRequest(
    entity_type=client.entity_type_path(
        PROJECT_ID, REGION, FEATURESTORE_ID, "creditcards"),
    csv_source=aiplatform.gapic.CsvSource(gcs_source=aiplatform.gapic.GcsSource(
        uris=["gs://fraud-detection-19102021/dataset/cc_details_train.csv"])),
    entity_id_field="cc_id",
    feature_specs=[
        aiplatform.gapic.ImportFeatureValuesRequest.FeatureSpec(id=fid)
        for fid in cc_feature_ids
    ],
    ## NOTE(review): 'time' must be a CSV column of parseable timestamps —
    ## a job that "completes" but imports 0 values is consistent with every
    ## row's feature_time failing to parse; verify the column's format.
    feature_time_field='time',
    worker_count=1,
)
## Start the import and poll the LRO until ingestion completes.
ingestion_lro = client.import_feature_values(import_cc_request)
ingestion_lro.result()
## Import feature values for the fraud entity type from its GCS CSV file.
fraud_csv_source = aiplatform.gapic.CsvSource(
    gcs_source=aiplatform.gapic.GcsSource(
        uris=["gs://fraud-detection-19102021/dataset/data_fraud_train.csv"]))
fraud_feature_specs = [
    aiplatform.gapic.ImportFeatureValuesRequest.FeatureSpec(id="class"),
]
import_fraud_request = aiplatform.gapic.ImportFeatureValuesRequest(
    entity_type=client.entity_type_path(
        PROJECT_ID, REGION, FEATURESTORE_ID, "frauds"),
    csv_source=fraud_csv_source,
    entity_id_field="fraud_id",
    feature_specs=fraud_feature_specs,
    feature_time_field='time',
    worker_count=1,
)
## Kick off the ingestion and block until the operation finishes.
ingestion_lro = client.import_feature_values(import_fraud_request)
ingestion_lro.result()
当我从 Google Cloud Console 的 Feature 部分检查 Ingestion Jobs 时,我看到工作已完成,但没有向我的功能添加任何值。
任何建议都非常宝贵。
谢谢大家。
编辑 1
在下图中,有一个我用作输入的 csv 文件的第一行示例 (cc_details_train.csv)。所有看不见的特征都是相似的,特征class 可以假设为 0 或 1 值。
注入作业持续大约 5 分钟以导入(理想情况下)3000 行,但它没有错误地结束,也没有导入任何值。
【问题讨论】:
-
我有一些问题：你所有的字段都是数字吗？你把它们全部设置为 DOUBLE 值类型，可以确认一下吗？你要处理的文件有多大？（官方示例数据使用的是电影数据集。）你可能还需要查看这里关于处理时间的说明。另外，能否在代码中准确指出报错发生的位置？
-
@Betjens 感谢您的评论。 1)是的,我要摄取的所有特征值都是双精度类型。 2)我的文件大小是 3000 行。从 GCP 的“功能”部分,我看到摄取作业已完成,但功能中添加了 0 个值。 3) 当我执行 ingestion_lro.result() 时出现错误。
-
所以我挖掘了一下,发现 cc_details_train.csv 是你的实际输入文件......这需要澄清。如果您还可以提供其他信息,这将有所帮助。即:输入文件的示例行。我认为您应该使用更少的功能或尽可能增加您的工人数量。对于我的测试运行,我只使用一行,第一次需要 10-15 分钟。我无法想象 3000 行需要多少时间。
-
@Betjens 在我完成的测试中,注入工作持续大约 5 分钟以导入(理想情况下)3000 行,但它没有错误地结束并且没有导入任何值。我添加了 csv 文件第一行的图像。
-
你的 entity_id 是 cc_id???我看到你的文件还包含一个名为类的列,它包含什么???。我也复制了 3000 行并添加了 0 个值。根据official documentation,每次运行都会有延迟。另外,您是否尝试过使用顶点用户界面? Vertex-AI > 特征
标签: google-cloud-platform google-api-python-client google-cloud-ml google-cloud-vertex-ai