-- Register Advance Matching Module[AMM] Hive UDF jar
ADD JAR <Directory path>/amm.hive.${project.version}.jar;
-- Provide alias to UDF class (optional).
String in quotes represent class names needed for this job to run.
CREATE TEMPORARY FUNCTION rowid as 'com.pb.bdq.hive.common.RowIDGeneratorUDF';
-- This rowid is needed by Interflow Match to maintain the
order of rows while creating groups. This is a UDF (User Defined Function)
and associates an incremental unique integer number to each row of the data.
CREATE TEMPORARY FUNCTION InterMatch as 'com.pb.bdq.amm.process.hive.interflow.InterMatchUDAF';
-- Inter Flow is implemented as a UDAF (User Defined Aggregation function).
It processes one group of rows at a time based on join field
and generates the result for that group of rows.
-- Disable map side aggregation
set hive.map.aggr = false;
-- Set the rule using configuration property 'hivevar:rule'
set hivevar:rule='{"type":"Parent", "missingDataMethod":
"IgnoreBlanks", "threshold":100.0, "weight":0,
"children":[{"type":"Child", "missingDataMethod":"IgnoreBlanks", "threshold":80.0,
"weight":0, "matchWhenNotTrue":false, "scoringMethod":"Maximum",
"algorithms":[{"name":"EditDistance", "weight":0, "options":null},
{"name":"Metaphone", "weight":0, "options":null}],
"crossMatchField":[], "suspectField":"firstname", "candidateField":null},
{"type":"Child", "missingDataMethod":"IgnoreBlanks",
"threshold":80.0, "weight":0,
"matchWhenNotTrue":false, "scoringMethod":"Maximum",
"algorithms":[{"name":"KeyboardDistance",
"weight":0, "options":null},
{"name":"Metaphone3", "weight":0,
"options":null}], "crossMatchField":[],
"suspectField":"lastname", "candidateField":null}],
"scoringMethod":"Average", "matchingMethod":"AllTrue",
"name":"NameData", "matchWhenNotTrue":false}';
-- Set the header for suspect table using configuration property 'hivevar:suspectheader'
set hivevar:suspectheader='name,firstname,lastname,matchkey,middlename,recordid';
-- Set the header for candidate table using configuration property 'hivevar:Candidateheader'
set hivevar:Candidateheader='name,firstname,lastname,matchkey,middlename,recordid';
-- Set the sorting field to the candidates unique id's
alias used in the query. This is not from the input data.
set hivevar:sortfield='c_id';
-- Set the express match column(optional)
set hivevar:expressMatchColumn='matchkey';
-- Optionally, one can also set 'hivevar:intercomparison='returnUniqueCandidates,true'';
set hivevar:intercomparison='returnUniqueCandidates,true';
-- Set sort collection number option for unique records using
configuration property 'hivevar:collectionNumberZero'
set hivevar:collectionNumberZero='false';
-- Execute Query on the desired table. The query uses a UDF rowid,
which must be present in the query to maintain the ordering of the data while reading.
SELECT innerresult.record ["MatchRecordType"],
innerresult.record ["MatchScore"],
innerresult.record ["HasDuplicate"],
innerresult.record ["CollectionNumber"],
coalesce(innerresult.record ["ExpressMatched"], ''),
innerresult.record ["SourceType"],
innerresult.record ["name"],
innerresult.record ["firstname"],
innerresult.record ["lastname"],
innerresult.record ["matchkey"],
innerresult.record ["middlename"],
innerresult.record ["recordid"]
FROM (
SELECT interMatch(${hivevar:rule},${hivevar:sortfield},${hivevar:expressMatchColumn},
${hivevar:collectionNumberZero},${hivevar:interComparison},${hivevar:Candidateheader}
${hivevar:Suspectheader},unionresults.id,unionresults.name,unionresults.firstname,
unionresults.lastname, unionresults.matchkey, unionresults.middlename,
unionresults.recordid ,unionresults.TYPE)
AS matchgroup
FROM (
SELECT rowid(*) AS id, 'Suspect' AS TYPE,fullname as name,fname
as firstname,lname as lastname,matchkey as matchkey,mname as middlename,recordid as recordid
FROM customer_name_suspect
UNION ALL
SELECT rowid(*) AS id , 'Candidate' AS TYPE, name as name,firstname as
firstname,lastname as lastname,matchkey as matchkey,middlename as middlename ,customerid as recordid
FROM customer_name_candidate) unionresults
GROUP BY matchkey) AS innerResult LATERAL VIEW explode(innerResult.matchgroup) innerresult AS record;
-- Query to dump data to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/intermatch/output'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
collection items terminated by '||' map keys terminated by ':'
SELECT innerresult.record ["MatchRecordType"],
innerresult.record ["MatchScore"],
innerresult.record ["HasDuplicate"],
innerresult.record ["CollectionNumber"],
coalesce(innerresult.record ["ExpressMatched"], ''),
innerresult.record ["SourceType"],
innerresult.record ["name"],
innerresult.record ["firstname"],
innerresult.record ["lastname"],
innerresult.record ["matchkey"],
innerresult.record ["middlename"],
innerresult.record ["recordid"]
FROM (
SELECT interMatch(${hivevar:rule},${hivevar:sortfield},${hivevar:expressMatchColumn},
${hivevar:collectionNumberZero},${hivevar:interComparison},
${hivevar:Candidateheader},${hivevar:Suspectheader},
unionresults.id,unionresults.name,unionresults.firstname,
unionresults.lastname, unionresults.matchkey,
unionresults.middlename,unionresults.recordid ,unionresults.TYPE) AS matchgroup
FROM (
SELECT rowid(*) AS id, 'Suspect' AS TYPE,fullname as name,fname as
firstname,lname as lastname,matchkey as matchkey,mname as middlename,recordid as recordid
FROM customer_name_suspect
UNION ALL
SELECT rowid(*) AS id , 'Candidate' AS TYPE, name as name,
firstname as firstname,lastname as lastname,matchkey as matchkey,
middlename as middlename ,customerid as recordid
FROM customer_name_candidate) unionresults
GROUP BY matchkey) AS innerResult LATERAL VIEW explode(innerResult.matchgroup) innerresult AS record;
-- Sample input Suspect data
--+------------------+----------+-----------------+-------------+------------+-----------+
--| name | firstname| lastname | matchkey | middlename | recordid |
--+------------------+----------+-----------------+-------------+------------+-----------+
--| LAURA ABADSANTOS| LAURA | ABADSANTOS | L | | 1 |
--+------------------+----------+-----------------+-------------+------------+-----------+
-- Sample input candidate data
--+------------------+----------+-----------------+-------------+------------+-----------+
--| name | firstname| lastname | matchkey | middlename | recordid |
--+------------------+----------+-----------------+-------------+------------+-----------+
--| KATHRYN E ABATE | KATHRYN | ABATE | L | E | 3 |
--| ANNA ABAYEV | ANNA | ABAYEV | L | | 5 |
--+------------------+----------+-----------------+-------------+------------+-----------+
-- Sample output data
--+---------------+----------+------------+----------------+--------------+----------+------------+----------+---------+--------+----------+-----------+
--|MatchRecordType|MatchScore|HasDuplicate|CollectionNumber|ExpressMatched|SourceType| name | firstname| lastname|matchkey|middlename| recordid |
--+---------------+----------+------------+----------------+--------------+----------+------------+----------+---------+--------+----------+-----------+
--|S |0 |Y |0-0-1 | |S |LAURA ABADSA| LAURA |ABADSANTO| L | | 1 |
--|D |80 |D |0-0-1 |N |C |KATHRYN E AB| KATHRYN |AB | L | E | 3 |
--|D |90 |D |0-0-1 |N |C |ANNA ABAYEV | ANNA |ABAYEV | L | | 5 |
--+---------------+----------+------------+----------------+--------------+----------+------------+----------+---------+--------+----------+-----------+