在上述建议的基础上加入 PubSub,下面是一个可以正常运行的版本。
先决条件
- 已创建 GCS 存储桶(用于 Dataflow 临时/暂存文件)
- PubSub 主题已创建
- PubSub 订阅已创建
- BigTable 实例已创建
- BigTable 表已创建
- 必须在 BigTable 表中创建列族(否则写入时不会有任何可见的错误提示!)
以最后一项为例,使用 cbt 创建列族:
cbt -instance test-instance createfamily test-table cf1
代码
定义并运行 Dataflow 管道。
# Packages
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from google.cloud import pubsub_v1
# Classes
class CreateRowFn(beam.DoFn):
    """Turn each incoming element into a Bigtable ``DirectRow`` keyed by it.

    Every emitted row carries exactly one cell: column family ``cf1``,
    column ``field1``, value ``value1``.
    """

    def __init__(self, pipeline_options):
        """Remember the Bigtable target named in the pipeline options.

        Args:
            pipeline_options: Options object exposing the
                ``bigtable_instance`` and ``bigtable_table`` attributes.
        """
        super().__init__()
        self.instance_id = pipeline_options.bigtable_instance
        self.table_id = pipeline_options.bigtable_table

    def process(self, key):
        """Yield one ``DirectRow`` whose row key is ``key``.

        Args:
            key: Row key for the new row.

        Yields:
            The ``DirectRow`` with its single cell set.
        """
        # Imports are kept local to the method, as in the original code.
        from google.cloud.bigtable import row
        import datetime

        # Use an explicit, timezone-aware UTC timestamp. A naive local
        # ``datetime.now()`` is ambiguous and is treated as UTC by the Google
        # Cloud client helpers, which skews the cell timestamp whenever the
        # host is not running in UTC.
        cell_timestamp = datetime.datetime.now(datetime.timezone.utc)

        direct_row = row.DirectRow(row_key=key)
        direct_row.set_cell(
            'cf1',
            'field1',
            'value1',
            timestamp=cell_timestamp)
        yield direct_row
# Options
class XyzOptions(PipelineOptions):
    """Pipeline options that add the Bigtable destination flags."""

    @classmethod
    def _add_argparse_args(cls, parser):
        """Register the three Bigtable flags, each with a default value.

        Args:
            parser: Parser object on which ``add_argument`` is called.
        """
        # (flag, default) pairs, registered in this order.
        flag_defaults = (
            ('--bigtable_project', 'nested'),
            ('--bigtable_instance', 'instance'),
            ('--bigtable_table', 'table'),
        )
        for flag, fallback in flag_defaults:
            parser.add_argument(flag, default=fallback)
# Options shared by the whole pipeline. PROJECT, REGION and the other
# upper-case names are expected to be defined before this statement runs.
pipeline_options = XyzOptions(
    # Execution settings.
    save_main_session=True,
    streaming=True,
    runner='DataflowRunner',
    project=PROJECT,
    region=REGION,
    # Storage locations and worker requirements.
    temp_location=TEMP_LOCATION,
    staging_location=STAGING_LOCATION,
    requirements_file=REQUIREMENTS_FILE,
    # Bigtable destination (flags declared by XyzOptions).
    bigtable_project=PROJECT,
    bigtable_instance=INSTANCE,
    bigtable_table=TABLE,
)
# Pipeline
def run(argv=None):
    """Build the Pub/Sub-to-Bigtable pipeline and run it.

    Messages are read as bytes from the subscription, decoded as UTF-8,
    turned into row objects by ``CreateRowFn`` and written to the Bigtable
    table named in the module-level ``pipeline_options``.

    Args:
        argv: Accepted but not used by this function.
    """
    with beam.Pipeline(options=pipeline_options) as pipeline:
        input_subscription = f"projects/{PROJECT}/subscriptions/{SUBSCRIPTION}"

        # Each stage is named so the data flow reads top to bottom.
        raw_messages = pipeline | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
            subscription=input_subscription).with_output_types(bytes)

        row_keys = raw_messages | 'Conversion UTF-8 bytes to string' >> beam.Map(
            lambda payload: payload.decode('utf-8'))

        rows = row_keys | 'Conversion string to row object' >> beam.ParDo(
            CreateRowFn(pipeline_options))

        _ = rows | 'Writing row object to BigTable' >> WriteToBigTable(
            project_id=pipeline_options.bigtable_project,
            instance_id=pipeline_options.bigtable_instance,
            table_id=pipeline_options.bigtable_table)
# Start the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    run()
向 PubSub 主题发布消息 b"phone#1111"(例如,使用 Python PublisherClient())。
表内容(使用 happybase 读取)
b'phone#1111': {b'cf1:field1': b'value1'}
Row length: 1