从现有样本数据生成合成时间序列数据答案

【问题标题】：Generate synthetic time series data from existing sample data（从现有样本数据生成合成时间序列数据）
【发布时间】：2020-01-02 18:18:25
【问题描述】：

Python 中是否有好用的库/工具，可以根据现有样本数据生成合成时间序列数据？（例如，我有 1 月至 6 月的销售数据，希望生成 7 月至 12 月的合成时间序列数据样本）（同时保持趋势、季节性等时间序列因素不变）。

【问题讨论】：

为了什么目的？
由于数据少，模型可能会过拟合，我想生成合成样本。
你不能生成虚假数据并用它来得出见解......

标签： python machine-learning time-series

【解决方案1】：

撇开此类数据的质量问题不谈，这里有一个简单的方法，您可以使用高斯分布根据样本生成合成数据。下面是关键部分。

import numpy as np
# Illustrative fragment: `x` is a placeholder for the original sample and must
# already be defined (as an np.array of features) before these lines run.
x # original sample np.array of features
# Mean and standard deviation of the sample, both reduced along axis 1.
feature_means = np.mean(x, axis=1)
feature_std = np.std(x, axis=1)
# Draw one normally distributed value per (mean, std) pair; the result has the
# same shape as `feature_means`.
random_normal_feature_values = np.random.normal(feature_means, feature_std)

这是我使用的功能齐全的代码，

def _feature_matrix(sample_dataset, feature_length):
    """Return the first ``feature_length`` columns as a float array of shape (rows, features)."""
    if feature_length is None:
        # Backward-compatible fallback: callers that do not pass the argument
        # keep reading the module-level ``configuration`` mapping.
        feature_length = configuration['feature_length']
    n_columns = len(sample_dataset.columns)
    if not 0 < feature_length <= n_columns:
        raise ValueError(
            "feature_length must be between 1 and {}, got {}".format(
                n_columns, feature_length
            )
        )
    return sample_dataset.iloc[:, :feature_length].to_numpy(dtype=float)


def _draw_window_samples(series, window, variance_range):
    """Draw one value per sliding window and feature; result has shape (features, windows)."""
    n_windows = len(series) - window + 1
    # Explicit (windows, window, features) layout. No squeeze() is used, so a
    # single feature, a window of length 1 or a single window all keep their axes.
    windows = np.stack(
        [series[start:start + window] for start in range(n_windows)]
    )
    means = windows.mean(axis=1)
    stds = windows.std(axis=1) / variance_range
    samples = np.random.normal(means, stds).T
    samples = np.round(samples, 0)
    # Negative draws are clamped to zero.
    samples[samples < 0] = 0
    return samples


def generate_synthetic_data(sample_dataset, window_mean, window_std, fixed_window=None, variance_range=1, sythesize_ratio=2, forced_reverse=False, feature_length=None):
    """Generate synthetic rows by sampling around sliding-window statistics.

    For each of ``sythesize_ratio`` passes, every sliding window of the chosen
    length over the first ``feature_length`` columns yields one synthetic row:
    each feature is drawn from a normal distribution whose mean is the window
    mean and whose standard deviation is the window standard deviation divided
    by ``variance_range``. Draws are rounded to whole numbers and negative
    values are replaced by 0.

    Args:
        sample_dataset: pandas DataFrame holding the original observations,
            one row per time step. The sampled columns must be numeric.
        window_mean: Mean of the normal distribution used to draw the window
            length when ``fixed_window`` is None.
        window_std: Standard deviation of that distribution.
        fixed_window: Window length to use for every pass; when None the
            length is drawn randomly and truncated to an int.
        variance_range: Divisor applied to the window standard deviation;
            larger values keep the synthetic values closer to the window mean.
        sythesize_ratio: Number of passes over the dataset.
        forced_reverse: Accepted for backward compatibility; currently unused.
        feature_length: Number of leading columns to synthesize. When None,
            ``configuration['feature_length']`` is read from module scope.

    Returns:
        A DataFrame with the columns of ``sample_dataset`` plus
        ``synthesis_seq`` (the pass index). Columns beyond ``feature_length``
        are left as NaN. Each pass contributes
        ``len(sample_dataset) - window + 1`` rows. Passes whose window length
        is not positive or exceeds the dataset length contribute nothing.

    Raises:
        ValueError: If ``feature_length`` is not between 1 and the number of
            columns of ``sample_dataset``.
    """
    output_columns = list(sample_dataset.columns) + ["synthesis_seq"]
    n_rows = len(sample_dataset)
    frames = []
    generated_rows = 0
    series = None

    for k in range(sythesize_ratio):
        if generated_rows >= n_rows * sythesize_ratio:
            break

        if fixed_window is not None:
            window = fixed_window
        else:
            window = int(np.random.normal(window_mean, window_std))

        # A non-positive window has no samples to average and a window longer
        # than the data has no complete window; skip the pass in both cases.
        if window <= 0 or n_rows < window:
            continue

        if series is None:
            # Extracted lazily so an all-skipped run never needs feature_length.
            series = _feature_matrix(sample_dataset, feature_length)

        # The series is processed back-to-front at random and the result is
        # flipped back afterwards. Each output row still describes the same
        # window; only the order in which random draws are consumed changes.
        reverse_series = np.random.normal(0, 1) > 0
        source = series[::-1] if reverse_series else series
        samples = _draw_window_samples(source, window, variance_range)
        if reverse_series:
            samples = samples[:, ::-1]

        block = pd.DataFrame(
            samples.T, columns=sample_dataset.columns[:series.shape[1]]
        )
        block = block.reindex(columns=output_columns)
        block["synthesis_seq"] = k
        frames.append(block)
        generated_rows += len(block)

    if not frames:
        return pd.DataFrame(columns=output_columns)
    # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
    return pd.concat(frames, ignore_index=True)

# Driver: generate one synthetic dataset per variance scale (10**i) and collect
# the results. `source_path`, `original_data` and `synthetic_data` are not
# defined in this snippet and must exist before the loop runs.
for i in range(1):
    directory_name = '/synthetic_'+str(i)
    mypath = source_path+ '/cleaned'+directory_name
    # makedirs(exist_ok=True) replaces the separate exists()/mkdir() pair: it
    # has no check-then-create race and also creates a missing parent directory.
    os.makedirs(mypath, exist_ok=True)

    data = generate_synthetic_data(original_data, window_mean = 0,  window_std= 0, fixed_window=2 ,variance_range = 10**i, sythesize_ratio = 1)
    # NOTE(review): presumably `synthetic_data` is a list created earlier; if it
    # were a DataFrame, the result of append() would be discarded here -- confirm.
    synthetic_data.append(data)
    # Uncomment to persist each generated dataset next to the cleaned data.
    #data.to_csv(mypath+'/synthetic_'+str(i)+'_dt31_05_.csv',  index=False )
    print('synth step : ', i, ' len : ', len(synthetic_data))

祝你好运！

【讨论】：

请提供可复现的答案：运行上述代码会报错 NameError: name 'source_path' is not defined。另外，OP 要求的是在考虑时间序列各组成部分（如趋势、季节性）的前提下生成新数据集，请查看 this 了解更多详情。