要求:查找并记录与对应主记录匹配的交易记录。
必须保留与主记录匹配的所有交易记录的列表。
比较基于“日期”和“金额”。(已更新:现在可以使用当前行中的任意字段进行比较,只需传入一个要使用的“列名”数组即可。)
问题是,如果数组未按您要比较的键排序,这可能会变得非常昂贵。
一种方法是根据每条记录的“关键数据字段”生成一个唯一、易于生成且长度固定的“键”,以便于比较。
然后使用这些“键”为原始记录建立一个以“生成的键”为索引的查找(lookup)数组。
这样就不必在要比较的字段上提供排序数据。但是,生成的查找数组必须适合内存。
我决定对拼接后的关键字段数据使用 MD5 哈希。在这个应用场景中,哈希碰撞的可能性无关紧要。MD5 非常适合生成唯一的哈希值,而且速度也很快。
Working code at eval.in
Source Code
完成工作的类:
// ---------------------------------------------------------------------------------
class HashMatch {
    /**
     * Master source records, keyed by record id.
     *
     * @var array
     */
    private $master = array();

    /**
     * Transaction source records, keyed by record id.
     *
     * They must use the same field names as the master records for the
     * fields that are used to generate the MD5 hash.
     *
     * @var array
     */
    private $transaction = array();

    /**
     * Lookup table for the master records, keyed by the generated MD5 hash.
     *
     * Each entry holds:
     *   'masterId' => id of the first master record that produced the hash
     *   'matchIds' => ids of every master record that produced the hash
     *
     * @var array
     */
    private $hashMaster = array();

    /**
     * Lookup table for the matched transactions, keyed by the generated MD5 hash.
     *
     * Each entry holds:
     *   'masterId' => id of the first master record with the same hash
     *   'matchIds' => ids of every transaction record that produced the hash
     *
     * @var array
     */
    private $hashTransaction = array();

    /**
     * Names of the record fields that are used to generate the MD5 hash.
     *
     * @var array
     */
    private $keyNames = array();

    /**
     * Store the source records and the list of field names that form the key.
     *
     * Any record layout can be used with this class because the caller
     * supplies the names of the fields to hash.
     *
     * @param array $master      master records
     * @param array $transaction transaction records
     * @param array $keyNames    field names used to build the comparison key
     *
     * @return void
     */
    public function __construct(array $master,
                                array $transaction,
                                array $keyNames = array('when', 'amount'))
    {
        $this->master      = $master;
        $this->transaction = $transaction;
        $this->keyNames    = $keyNames;
    }

    /**
     * Generate all the hashes and store all the matching details.
     *
     * @return bool true when at least one master hash exists and at least
     *              one transaction matched a master record
     */
    public function generateMatches()
    {
        $this->processMaster();
        $this->processTransaction();

        return !empty($this->hashMaster) && !empty($this->hashTransaction);
    }

    /**
     * Build the master lookup table keyed by MD5 hash.
     *
     * Master records that share a hash are collected in the 'matchIds' list
     * of the same entry.
     *
     * @return void
     */
    public function processMaster()
    {
        // Start from a clean table so that repeated calls do not append
        // the same record ids again.
        $this->hashMaster = array();

        foreach ($this->master as $recordId => $data) {
            $hash = $this->generateHash($data);

            if (!isset($this->hashMaster[$hash])) { // first record with this key
                $this->hashMaster[$hash] = array(
                    'masterId' => $recordId,
                    'matchIds' => array(),
                );
            }

            // the list includes the first record itself
            $this->hashMaster[$hash]['matchIds'][] = $recordId;
        }
    }

    /**
     * Build the transaction lookup table keyed by MD5 hash.
     *
     * Only transactions whose hash exists in the master lookup table are
     * recorded; all others are skipped. The master table must therefore
     * have been built (see processMaster) before this is called.
     *
     * @return void
     */
    public function processTransaction()
    {
        // Start from a clean table so that repeated calls do not append
        // the same record ids again.
        $this->hashTransaction = array();

        foreach ($this->transaction as $recordId => $data) {
            $hash = $this->generateHash($data);

            if (!isset($this->hashMaster[$hash])) { // no master with this key
                continue;
            }

            if (!isset($this->hashTransaction[$hash])) { // first match for this key
                $this->hashTransaction[$hash] = array(
                    'masterId' => $this->hashMaster[$hash]['masterId'],
                    'matchIds' => array(),
                );
            }

            $this->hashTransaction[$hash]['matchIds'][] = $recordId;
        }
    }

    /**
     * Return the master lookup table.
     *
     * The keys are the generated hashes. Each entry holds:
     *
     *   'masterId' ==> the first master record that produced this key
     *
     *   'matchIds' ==> a *complete* list of all the master records that have
     *                  this key. It includes 'masterId' itself so the list
     *                  can be used directly when reporting.
     *
     * @return array
     */
    public function getHashMasterList()
    {
        return $this->hashMaster;
    }

    /**
     * Return the master lookup entries that have more than one master record,
     * i.e. duplicate master records with the same hash.
     *
     * @return array
     */
    public function getHashMatchedMasterList()
    {
        $out = array();
        foreach ($this->hashMaster as $key => $item) {
            if (count($item['matchIds']) >= 2) {
                $out[$key] = $item;
            }
        }

        return $out;
    }

    /**
     * Return all the transactions that matched a master record.
     *
     * @return array
     */
    public function getHashTransactionList()
    {
        return $this->hashTransaction;
    }

    /**
     * Convert a hash key back into the source records that produced it.
     *
     * The result contains:
     *
     *   'key'         => the hash that was passed in
     *   'master'      => list of the matching master records
     *   'transaction' => list of the matching transaction records
     *
     * Both lists are empty when the hash is unknown.
     *
     * @param string $hash
     *
     * @return array
     */
    public function getMatchedRecords($hash)
    {
        $out = array('key'         => $hash,
                     'master'      => array(),
                     'transaction' => array(),
                    );

        if (!empty($this->hashMaster[$hash])) { // just in case it is an invalid hash
            foreach ($this->hashMaster[$hash]['matchIds'] as $recordId) {
                $out['master'][] = $this->master[$recordId];
            }
        }

        if (!empty($this->hashTransaction[$hash])) {
            foreach ($this->hashTransaction[$hash]['matchIds'] as $recordId) {
                $out['transaction'][] = $this->transaction[$recordId];
            }
        }

        return $out;
    }

    /**
     * Generate an MD5 hash from the key fields of a data record.
     *
     * The fields to use are the names passed to the constructor and stored
     * in '$keyNames', so nothing has to be edited to use this class with a
     * different record layout.
     *
     * Each value is converted to a string and prefixed with its byte length
     * before hashing. Without that, plain concatenation lets different
     * records produce the same text: ('2016-04-0', '112345') and
     * ('2016-04-01', '12345') would both become '2016-04-0112345'.
     *
     * @param array $row
     *
     * @return string 32 character hexadecimal MD5 hash
     */
    public function generateHash($row)
    {
        $text = '';
        foreach ($this->keyNames as $name) {
            // A missing or null field contributes an empty value.
            $value = isset($row[$name]) ? (string) $row[$name] : '';
            $text .= strlen($value) . ':' . $value;
        }

        return md5($text);
    }
}
解释...
later....
运行它的代码:
// The third constructor argument lists the record fields that are combined
// into the comparison key.
$match = new HashMatch($master, $transaction, array('whenDone', 'amount'));
$match->generateMatches();

// Print every key that is shared by two or more master records.
echo '<pre>Hash Master Records with multiple Matching Masters ... ' . PHP_EOL
   . print_r($match->getHashMatchedMasterList(), true)
   . '</pre>';
输出:
Matching Master to Transaction...
Array
(
[key] => 296099e19b77aad413600a1e2f2cb3cd
[master] => Array
(
[0] => Array
(
[name] => John Matched
[whenDone] => 2016-04-01
[amount] => 12345
[email] => johnMatched@y.com
)
[1] => Array
(
[name] => Jane Matched
[whenDone] => 2016-04-01
[amount] => 12345
[email] => janeMatched@y.com
)
)
[transaction] => Array
(
[0] => Array
(
[name] => John Doe
[whenDone] => 2016-04-01
[amount] => 12345
[email] => johndoe@y.com
)
[1] => Array
(
[name] => micky mean
[whenDone] => 2016-04-01
[amount] => 12345
[email] => mickym@y.com
)
)
)
测试数据
// Sample master records: 'John Matched' and 'Jane Matched' share the same
// 'whenDone' and 'amount' values.
$master = array(
    array('name' => 'First last',     'whenDone' => '2016-03-03', 'amount' => 12000, 'email' => 'sample@y.com'),
    array('name' => 'John Matched',   'whenDone' => '2016-04-01', 'amount' => 12345, 'email' => 'johnMatched@y.com'),
    array('name' => 'Jane Unmatched', 'whenDone' => '2016-05-02', 'amount' => 12345, 'email' => 'janeUnmatched@y.com'),
    array('name' => 'Jane Matched',   'whenDone' => '2016-04-01', 'amount' => 12345, 'email' => 'janeMatched@y.com'),
);

// Sample transaction records: the last two share 'whenDone' and 'amount'
// with the two "Matched" master records.
$transaction = array(
    array('name' => 'Mary Lamb',  'whenDone' => '2016-03-04', 'amount' => 12000, 'email' => 'maryl@y.com'),
    array('name' => 'John Doe',   'whenDone' => '2016-04-01', 'amount' => 12345, 'email' => 'johndoe@y.com'),
    array('name' => 'micky mean', 'whenDone' => '2016-04-01', 'amount' => 12345, 'email' => 'mickym@y.com'),
);