-- Register Advance Matching Module[AMM] Hive UDF jar
ADD JAR <Directory path>/amm.hive.${project.version}.jar;
-- Provide alias to UDF class (optional). String in quotes represent class names needed for this job to run.
CREATE TEMPORARY FUNCTION rowid as 'com.pb.bdq.hive.common.RowIDGeneratorUDF';
-- This rowid is needed by Interflow Match to maintain the order of rows while creating groups. This is a UDF (User Defined Function) and associates an incremental unique integer number to each row of the data.
CREATE TEMPORARY FUNCTION InterMatch as 'com.pb.bdq.amm.process.hive.interflow.InterMatchUDAF';
-- Inter Flow is implemented as a UDAF (User Defined Aggregation function). It processes one group of rows at a time based on join field and generates the result for that group of rows.
-- Disable map side aggregation
set hive.map.aggr = false;
-- Set the rule using configuration property 'pb.bdq.match.rule'
set pb.bdq.match.rule={"type":"Parent", "missingDataMethod":"IgnoreBlanks", "threshold":100.0, "weight":0,
"children":[{"type":"Child", "missingDataMethod":"IgnoreBlanks", "threshold":80.0, "weight":0, "matchWhenNotTrue":false, "scoringMethod":"Maximum",
"algorithms":[{"name":"EditDistance", "weight":0, "options":null},
{"name":"Metaphone", "weight":0, "options":null}],
"crossMatchField":[], "suspectField":"firstname", "candidateField":null},
{"type":"Child", "missingDataMethod":"IgnoreBlanks", "threshold":80.0, "weight":0,
"matchWhenNotTrue":false, "scoringMethod":"Maximum",
"algorithms":[{"name":"KeyboardDistance", "weight":0, "options":null},
{"name":"Metaphone3", "weight":0, "options":null}], "crossMatchField":[],
"suspectField":"lastname", "candidateField":null}],
"scoringMethod":"Average", "matchingMethod":"AllTrue", "name":"NameData", "matchWhenNotTrue":false};
-- Set the header for suspect table using configuration property 'pb.bdq.suspect.header'
set pb.bdq.match.suspect.header=name,firstname,lastname,matchkey,middlename,recordid;
-- Set the header for candidate table using configuration property 'pb.bdq.candidate.header'
set pb.bdq.match.candidate.header=name,firstname,lastname,matchkey,middlename,recordid;
-- Set the sorting field to the candidates unique id's alias used in the query. This is not from the input data.
set pb.bdq.match.sort.field=c_id;
-- Set the express match column(optional)
set pb.bdq.match.express.column=matchkey;
-- Set sort field name to the alias used in the query, using configuration property 'pb.bdq.match.inter.comparison'
set pb.bdq.match.inter.comparison=maxNumOfDuplicates,2;
-- Optionally, one can also set 'pb.bdq.match.inter.comparison=returnUniqueCandidates,true';
-- Set sort collection number option for unique records using configuration property 'pb.bdq.match.unique.collectnumber.zero'
set pb.bdq.match.unique.collectnumber.zero=false;
-- Execute Query on the desired table. The query uses a UDF rowid, which must be present in the query to maintain the ordering of the data while reading.
SELECT lateralview.record ["MatchRecordType"],
lateralview.record ["MatchScore"],
lateralview.record ["HasDuplicate"],
lateralview.record ["CollectionNumber"],
coalesce(lateralview.record ["ExpressMatched"], ''),
lateralview.record ["SourceType"],
lateralview.record ["name"],
lateralview.record ["firstname"],
lateralview.record ["lastname"],
lateralview.record ["matchkey"],
lateralview.record ["middlename"],
lateralview.record ["recordid"]
FROM (
SELECT interMatch(s_id, s_name, s_firstname, s_lastname, s_matchkey, s_middlename, s_recordid, c_id,c_name, c_firstname, c_lastname, c_matchkey, c_middlename, c_recordid) AS
OUTPUT
FROM (
SELECT suspects.suspect_id AS s_id,
suspects.NAME AS s_name,
suspects.firstname AS s_firstname,
suspects.lastname AS s_lastname,
suspects.matchkey AS s_matchkey,
suspects.middlename AS s_middlename,
suspects.recordid AS s_recordid,
candidates.candidate_id AS c_id,
candidates.NAME AS c_name,
candidates.firstname AS c_firstname,
candidates.lastname AS c_lastname,
candidates.matchkey AS c_matchkey,
candidates.middlename AS c_middlename,
candidates.recordid AS c_recordid
FROM
(
SELECT rowid(*) AS suspect_id
,*
FROM namedataintersuspect
) AS suspects LEFT JOIN
(
SELECT rowid(*) AS candidate_id
,*
FROM namedataintercandidate
) AS candidates
on suspects.matchkey = candidates.matchkey
) AS joinrecords
GROUP BY joinrecords.s_matchkey
) AS innerResult LATERAL VIEW explode(innerResult.OUTPUT) lateralview AS record;
-- Query to dump data to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/intermatch/output'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
collection items terminated by '||' map keys terminated by ':'
SELECT lateralview.record ["MatchRecordType"],
lateralview.record ["MatchScore"],
lateralview.record ["HasDuplicate"],
lateralview.record ["CollectionNumber"],
coalesce(lateralview.record ["ExpressMatched"], ''),
lateralview.record ["SourceType"],
lateralview.record ["name"],
lateralview.record ["firstname"],
lateralview.record ["lastname"],
lateralview.record ["matchkey"],
lateralview.record ["middlename"],
lateralview.record ["recordid"]
FROM (
SELECT interMatch(s_id, s_name, s_firstname, s_lastname, s_matchkey, s_middlename, s_recordid, c_id,c_name, c_firstname, c_lastname, c_matchkey, c_middlename, c_recordid) AS
OUTPUT
FROM (
SELECT suspects.suspect_id AS s_id,
suspects.NAME AS s_name,
suspects.firstname AS s_firstname,
suspects.lastname AS s_lastname,
suspects.matchkey AS s_matchkey,
suspects.middlename AS s_middlename,
suspects.recordid AS s_recordid,
candidates.candidate_id AS c_id,
candidates.NAME AS c_name,
candidates.firstname AS c_firstname,
candidates.lastname AS c_lastname,
candidates.matchkey AS c_matchkey,
candidates.middlename AS c_middlename,
candidates.recordid AS c_recordid
FROM
(
SELECT rowid(*) AS suspect_id
,*
FROM namedataintersuspect
) AS suspects LEFT JOIN
(
SELECT rowid(*) AS candidate_id
,*
FROM namedataintercandidate
) AS candidates
on suspects.matchkey = candidates.matchkey
) AS joinrecords
GROUP BY joinrecords.s_matchkey
) AS innerResult LATERAL VIEW explode(innerResult.OUTPUT) lateralview AS record;
-- Sample input Suspect data
--+------------------+----------+-----------------+-------------+------------+-----------+
--| name | firstname| lastname | matchkey | middlename | recordid |
--+------------------+----------+-----------------+-------------+------------+-----------+
--| LAURA ABADSANTOS| LAURA | ABADSANTOS | L | | 1 |
--+------------------+----------+-----------------+-------------+------------+-----------+
-- Sample input candidate data
--+------------------+----------+-----------------+-------------+------------+-----------+
--| name | firstname| lastname | matchkey | middlename | recordid |
--+------------------+----------+-----------------+-------------+------------+-----------+
--| KATHRYN E ABATE | KATHRYN | ABATE | L | E | 3 |
--| ANNA ABAYEV | ANNA | ABAYEV | L | | 5 |
--+------------------+----------+-----------------+-------------+------------+-----------+
-- Sample output data
--+---------------+----------+------------+----------------+--------------+----------+------------+----------+---------+--------+----------+-----------+
--|MatchRecordType|MatchScore|HasDuplicate|CollectionNumber|ExpressMatched|SourceType| name | firstname| lastname|matchkey|middlename| recordid |
--+---------------+----------+------------+----------------+--------------+----------+------------+----------+---------+--------+----------+-----------+
--|S |0 |Y |0-0-1 | |S |LAURA ABADSA| LAURA |ABADSANTO| L | | 1 |
--|D |80 |D |0-0-1 |N |C |KATHRYN E AB| KATHRYN |AB | L | E | 3 |
--|D |90 |D |0-0-1 |N |C |ANNA ABAYEV | ANNA |ABAYEV | L | | 5 |
--+---------------+----------+------------+----------------+--------------+----------+------------+----------+---------+--------+----------+-----------+