Sample Hive Script

-- Register Advance Matching Module[AMM] Hive UDF jar
ADD JAR <Directory path>/amm.hive.${project.version}.jar;

-- Provide alias to UDF class (optional). 
String in quotes represent class names needed for this job to run.
CREATE TEMPORARY FUNCTION rowid as 'com.pb.bdq.hive.common.RowIDGeneratorUDF';

-- This rowid is needed by Interflow Match to maintain the 
order of rows while creating groups. This is a UDF (User Defined Function) 
and associates an incremental unique integer number to each row of the data.

CREATE TEMPORARY FUNCTION InterMatch as 'com.pb.bdq.amm.process.hive.interflow.InterMatchUDAF';

-- Inter Flow is implemented as a UDAF (User Defined Aggregation function).
 It processes one group of rows at a time based on join field
 and generates the result for that group of rows.

-- Disable map side aggregation 
set hive.map.aggr = false;

-- Set the rule using configuration property 'hivevar:rule'

set hivevar:rule='{"type":"Parent", "missingDataMethod":
"IgnoreBlanks", "threshold":100.0, "weight":0,
"children":[{"type":"Child", "missingDataMethod":"IgnoreBlanks", "threshold":80.0,
 "weight":0, "matchWhenNotTrue":false, "scoringMethod":"Maximum",
"algorithms":[{"name":"EditDistance", "weight":0, "options":null},
{"name":"Metaphone", "weight":0, "options":null}],
"crossMatchField":[], "suspectField":"firstname", "candidateField":null},
{"type":"Child", "missingDataMethod":"IgnoreBlanks",
 "threshold":80.0, "weight":0,
"matchWhenNotTrue":false, "scoringMethod":"Maximum",
"algorithms":[{"name":"KeyboardDistance", 
"weight":0, "options":null},
{"name":"Metaphone3", "weight":0,
 "options":null}], "crossMatchField":[],
"suspectField":"lastname", "candidateField":null}],
"scoringMethod":"Average", "matchingMethod":"AllTrue",
 "name":"NameData", "matchWhenNotTrue":false}';


-- Set the header for suspect table using configuration property 'hivevar:suspectheader'
set hivevar:suspectheader='name,firstname,lastname,matchkey,middlename,recordid';

-- Set the header for candidate table using configuration property 'hivevar:Candidateheader'
set hivevar:Candidateheader='name,firstname,lastname,matchkey,middlename,recordid';

-- Set the sorting field to the candidates unique id's
 alias used in the query. This is not from the input data.
set hivevar:sortfield='c_id';

-- Set the express match column(optional)
set hivevar:expressMatchColumn='matchkey';


-- Optionally, one can also set 'hivevar:intercomparison='returnUniqueCandidates,true''; 
set hivevar:intercomparison='returnUniqueCandidates,true'; 

-- Set sort collection number option for unique records using
 configuration property 'hivevar:collectionNumberZero'
set hivevar:collectionNumberZero='false';

-- Execute Query on the desired table. The query uses a UDF rowid, 
which must be present in the query to maintain the ordering of the data while reading.

SELECT innerresult.record ["MatchRecordType"],
	innerresult.record ["MatchScore"],
	innerresult.record ["HasDuplicate"],
	innerresult.record ["CollectionNumber"],
	coalesce(innerresult.record ["ExpressMatched"], ''),
	innerresult.record ["SourceType"],
	innerresult.record ["name"],
	innerresult.record ["firstname"],
	innerresult.record ["lastname"],
	innerresult.record ["matchkey"],
	innerresult.record ["middlename"],
	innerresult.record ["recordid"]
FROM (
	SELECT interMatch(${hivevar:rule},${hivevar:sortfield},${hivevar:expressMatchColumn},
       ${hivevar:collectionNumberZero},${hivevar:interComparison},${hivevar:Candidateheader}
       ${hivevar:Suspectheader},unionresults.id,unionresults.name,unionresults.firstname,
        unionresults.lastname, unionresults.matchkey, unionresults.middlename,
        unionresults.recordid ,unionresults.TYPE)
       AS matchgroup
    FROM (
	SELECT rowid(*) AS id, 'Suspect' AS TYPE,fullname as name,fname
    as firstname,lname as lastname,matchkey as matchkey,mname as middlename,recordid as recordid
    FROM customer_name_suspect
    UNION ALL 
	SELECT rowid(*) AS id , 'Candidate' AS TYPE, name as name,firstname as
 firstname,lastname as lastname,matchkey as matchkey,middlename as middlename ,customerid as recordid
    FROM customer_name_candidate) unionresults
    GROUP BY matchkey) AS innerResult LATERAL VIEW explode(innerResult.matchgroup) innerresult AS record;



-- Query to dump data to a file

INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/intermatch/output' 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY ',' 
collection items terminated by '||' map keys terminated by ':'
SELECT innerresult.record ["MatchRecordType"],
	innerresult.record ["MatchScore"],
	innerresult.record ["HasDuplicate"],
	innerresult.record ["CollectionNumber"],
	coalesce(innerresult.record ["ExpressMatched"], ''),
	innerresult.record ["SourceType"],
	innerresult.record ["name"],
	innerresult.record ["firstname"],
	innerresult.record ["lastname"],
	innerresult.record ["matchkey"],
	innerresult.record ["middlename"],
	innerresult.record ["recordid"]
FROM (
	SELECT interMatch(${hivevar:rule},${hivevar:sortfield},${hivevar:expressMatchColumn},
       ${hivevar:collectionNumberZero},${hivevar:interComparison},
       ${hivevar:Candidateheader},${hivevar:Suspectheader},
       unionresults.id,unionresults.name,unionresults.firstname, 
       unionresults.lastname, unionresults.matchkey,
       unionresults.middlename,unionresults.recordid ,unionresults.TYPE) AS matchgroup
    FROM (
	SELECT rowid(*) AS id, 'Suspect' AS TYPE,fullname as name,fname as
       firstname,lname as lastname,matchkey as matchkey,mname as middlename,recordid as recordid
    FROM customer_name_suspect
    UNION ALL 
	SELECT rowid(*) AS id , 'Candidate' AS TYPE, name as name,
    firstname as firstname,lastname as lastname,matchkey as matchkey,
    middlename as middlename ,customerid as recordid
    FROM customer_name_candidate) unionresults
    GROUP BY matchkey) AS innerResult LATERAL VIEW explode(innerResult.matchgroup) innerresult AS record;


-- Sample input Suspect data

--+------------------+----------+-----------------+-------------+------------+-----------+
--| name             | firstname| lastname        | matchkey    | middlename | recordid  |
--+------------------+----------+-----------------+-------------+------------+-----------+
--| LAURA  ABADSANTOS| LAURA    | ABADSANTOS      | L           |            | 1         | 
--+------------------+----------+-----------------+-------------+------------+-----------+

-- Sample input candidate data

--+------------------+----------+-----------------+-------------+------------+-----------+
--| name             | firstname| lastname        | matchkey    | middlename | recordid  |
--+------------------+----------+-----------------+-------------+------------+-----------+
--| KATHRYN E ABATE  | KATHRYN  | ABATE           | L           |  E         | 3         | 
--| ANNA  ABAYEV     | ANNA     | ABAYEV          | L           |            | 5         | 
--+------------------+----------+-----------------+-------------+------------+-----------+

-- Sample output data

--+---------------+----------+------------+----------------+--------------+----------+------------+----------+---------+--------+----------+-----------+
--|MatchRecordType|MatchScore|HasDuplicate|CollectionNumber|ExpressMatched|SourceType| name       | firstname| lastname|matchkey|middlename| recordid  |
--+---------------+----------+------------+----------------+--------------+----------+------------+----------+---------+--------+----------+-----------+
--|S              |0         |Y           |0-0-1           |              |S         |LAURA ABADSA| LAURA    |ABADSANTO|    L   |          | 1         |
--|D              |80        |D           |0-0-1           |N             |C         |KATHRYN E AB| KATHRYN  |AB       |    L   |    E     | 3         |
--|D              |90        |D           |0-0-1           |N             |C         |ANNA ABAYEV | ANNA     |ABAYEV   |    L   |          | 5         |
--+---------------+----------+------------+----------------+--------------+----------+------------+----------+---------+--------+----------+-----------+