-- Register Advance Matching Module[AMM] Hive UDF jar
ADD JAR <Directory path>/amm.hive.${project.version}.jar;
-- Provide alias to UDF class (optional).
String in quotes represent class names needed for this job to run.
CREATE TEMPORARY FUNCTION rowid as 'com.pb.bdq.hive.common.RowIDGeneratorUDF';
-- This rowid is needed by Transactional Match to maintain the
order of rows while creating groups. This is a UDF (User Defined Function) and
associates an incremental unique integer number to each row of the data.
CREATE TEMPORARY FUNCTION transactionalMatch as
'com.pb.bdq.amm.process.hive.transactional.TransactionalMatchUDAF';
-- Transactional Match is implemented as a UDAF (User Defined Aggregation function).
It processes one group of rows at a time and generates the result for that group of rows.
-- Disable map side aggregation
set hive.map.aggr = false;
-- Set the rule using configuration property 'hivevar:rule'
set hivevar:rule='{"type":"Parent", "children":[{"type":"Child", "matchWhenNotTrue":false,
"threshold":80.0, "weight":0, "algorithms":[{"name":"EditDistance", "weight":0, "options":null},
{"name":"Metaphone","weight":0,"options":null}],
"scoringMethod":"Maximum", "missingDataMethod":"IgnoreBlanks", "crossMatchField":[],
"suspectField":"firstname", "candidateField":null},
{"type":"Child", "matchWhenNotTrue":false, "threshold":80.0,
"weight":0, "algorithms":[{"name":"KeyboardDistance","weight":0,"options":null},
{"name":"Metaphone3","weight":0,"options":null}],
"scoringMethod":"Maximum", "missingDataMethod":"IgnoreBlanks",
"crossMatchField":[], "suspectField":"lastname", "candidateField":null}],
"matchingMethod":"AllTrue", "scoringMethod":"Average", "missingDataMethod":"IgnoreBlanks",
"name":"NameData", "matchWhenNotTrue":false, "threshold":100, "weight":0}';
-- Set header(along with id field alias used in query) using configuration property 'hivevar:header'
set hivevar:header='name,firstname,lastname,matchkey,middlename,recordid,id';
-- Set sort field name to the alias used in the query,
using the configuration property 'hivevar:sort'
set hivevar:sort='id';
-- Set sort collection number option for unique records using configuration property
'hivevar:uniquecolumnsreturn'. The default value is false.
set hivevar:uniquecolumnsreturn='true';
-- Execute Query on the desired table. The query uses a UDF rowid,
which must be present in the query to maintain the ordering of the data while reading.
-- Transactional Match returns a list of map containing <key=value> pairs.
Each map in the list corresponds to a row in the group. The below query
explodes that list of map and fetches fields from map by keys.
SELECT tmp2.record["MatchRecordType"],
tmp2.record["MatchScore"],
tmp2.record["HasDuplicate"],
tmp2.record["name"],
tmp2.record["firstname"],
tmp2.record["lastname"],
tmp2.record["matchkey"],
tmp2.record["middlename"],
tmp2.record["recordid"]
FROM (
SELECT transactionalMatch(innerRowID.name, innerRowID.firstname,
innerRowID.lastname, innerRowID.matchkey, innerRowID.middlename,
innerRowID.recordid, innerRowID.id
) AS matchgroup
FROM (
SELECT name, firstname, lastname, matchkey, middlename,
recordid, rowid(name, firstname, lastname, matchkey, middlename,
recordid) AS id FROM customer_data
) innerRowID
GROUP BY matchkey
) As innerResult
LATERAL VIEW explode(innerResult.matchgroup) tmp2 as record ;
-- Query to dump output to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/transmatch/' ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',' collection items terminated by '||' map keys terminated by ':'
SELECT tmp2.record["MatchRecordType"],
tmp2.record["MatchScore"],
tmp2.record["HasDuplicate"],
tmp2.record["name"],
tmp2.record["firstname"],
tmp2.record["lastname"],
tmp2.record["matchkey"],
tmp2.record["middlename"],
tmp2.record["recordid"]
FROM (
SELECT transactionalMatch(${hivevar:rule},
${hivevar:sort},
${hivevar:uniquecolumnsreturn},
${hivevar:header},
innerRowID.name,
innerRowID.firstname,
innerRowID.lastname,
innerRowID.matchkey,
innerRowID.middlename,
innerRowID.recordid,
innerRowID.id) as matchgroup
FROM (
SELECT name, firstname, lastname, matchkey, middlename,
recordid, rowid(name, firstname, lastname, matchkey, middlename, recordid) AS id
FROM customer_data
) innerRowID
GROUP BY matchkey ) As innerResult
LATERAL VIEW explode(innerResult.matchgroup) tmp2 as record ;
--sample input data
--+-------------------+------------------------+-----------------------+-----------------------+-------------------------+-----------------------+--+
--| name | firstname | lastname | matchkey | middlename | recordid |
--+-------------------+------------------------+-----------------------+-----------------------+-------------------------+-----------------------+--+
--| ZORINA ABDOOL | ZORINA | ABDOOL | Z | | 12 |
--| ZULFIQAR ALI | ZULFIQAR | ALI | Z | | 116 |
--| ZACHARY BENNETT | ZACHARY | BENNETT | Z | | 515 |
--| ZOHAR BUERGER | ZOHAR | BUERGER | Z | | 889 |
--+-------------------+------------------------+-----------------------+-----------------------+-------------------------+-----------------------+--+
--sample output data
--+----------------+---------------------+----------+------------+-----------+-----------------+------------+--------------+
--|name |firstname | lastname | matchkey | middlename | recordid | MatchRecordType | MatchScore | HasDuplicate |
--+----------------+----------+----------+----------+------------+-----------+-----------------+------------+--------------+
--|ZORINA ABDOOL |ZORINA | ABDOOL | Z | | 12 | S | 0 | Y |
--|ZULFIQAR ALI |ZULFIQAR | ALI | Z | | 116 | D | 90 | D |
--|ZACHARY BENNETT|ZACHARY | BENNETT | Z | | 515 | D | 91 | D |
--|ZOHAR BUERGER |ZOHAR | BUERGER | Z | | 889 | D | 91 | D |
--+----------------+----------+---------------------+------------------------+-----------------+------------+--------------+