Exemple de script Hive

-- Register Advance Matching Module[AMM] Hive UDF jar
ADD JAR <Directory path>/amm.hive.${project.version}.jar;

-- Provide alias to UDF class (optional). String in quotes represent class names needed for this job to run.
CREATE TEMPORARY FUNCTION rowid as 'com.pb.bdq.hive.common.RowIDGeneratorUDF';

-- This rowid is needed by Interflow Match to maintain the order of rows while creating groups. This is a UDF (User Defined Function) and associates an incremental unique integer number to each row of the data.

CREATE TEMPORARY FUNCTION InterMatch as 'com.pb.bdq.amm.process.hive.interflow.InterMatchUDAF';

-- Inter Flow is implemented as a UDAF (User Defined Aggregation function). It processes one group of rows at a time based on join field and generates the result for that group of rows.

-- Disable map side aggregation 
set hive.map.aggr = false;

-- Set the rule using configuration property 'pb.bdq.match.rule'

set pb.bdq.match.rule={"type":"Parent", "missingDataMethod":"IgnoreBlanks", "threshold":100.0, "weight":0,
"children":[{"type":"Child", "missingDataMethod":"IgnoreBlanks", "threshold":80.0, "weight":0, "matchWhenNotTrue":false, "scoringMethod":"Maximum",
"algorithms":[{"name":"EditDistance", "weight":0, "options":null},
{"name":"Metaphone", "weight":0, "options":null}],
"crossMatchField":[], "suspectField":"firstname", "candidateField":null},
{"type":"Child", "missingDataMethod":"IgnoreBlanks", "threshold":80.0, "weight":0,
"matchWhenNotTrue":false, "scoringMethod":"Maximum",
"algorithms":[{"name":"KeyboardDistance", "weight":0, "options":null},
{"name":"Metaphone3", "weight":0, "options":null}], "crossMatchField":[],
"suspectField":"lastname", "candidateField":null}],
"scoringMethod":"Average", "matchingMethod":"AllTrue", "name":"NameData", "matchWhenNotTrue":false};


-- Set the header for suspect table using configuration property 'pb.bdq.suspect.header'
set pb.bdq.match.suspect.header=name,firstname,lastname,matchkey,middlename,recordid;

-- Set the header for candidate table using configuration property 'pb.bdq.candidate.header'
set pb.bdq.match.candidate.header=name,firstname,lastname,matchkey,middlename,recordid;

-- Set the sorting field to the candidates unique id's alias used in the query. This is not from the input data.
set pb.bdq.match.sort.field=c_id;

-- Set the express match column(optional)
set pb.bdq.match.express.column=matchkey;

-- Set sort field name to the alias used in the query, using configuration property 'pb.bdq.match.inter.comparison'
set pb.bdq.match.inter.comparison=maxNumOfDuplicates,2;

-- Optionally, one can also set 'pb.bdq.match.inter.comparison=returnUniqueCandidates,true'; 

-- Set sort collection number option for unique records using configuration property 'pb.bdq.match.unique.collectnumber.zero'
set pb.bdq.match.unique.collectnumber.zero=false;

-- Execute Query on the desired table. The query uses a UDF rowid, which must be present in the query to maintain the ordering of the data while reading.

SELECT lateralview.record ["MatchRecordType"],
	lateralview.record ["MatchScore"],
	lateralview.record ["HasDuplicate"],
	lateralview.record ["CollectionNumber"],
	coalesce(lateralview.record ["ExpressMatched"], ''),
	lateralview.record ["SourceType"],
	lateralview.record ["name"],
	lateralview.record ["firstname"],
	lateralview.record ["lastname"],
	lateralview.record ["matchkey"],
	lateralview.record ["middlename"],
	lateralview.record ["recordid"]
FROM (
	SELECT interMatch(s_id, s_name, s_firstname, s_lastname, s_matchkey, s_middlename, s_recordid, c_id,c_name, c_firstname, c_lastname, c_matchkey, c_middlename, c_recordid) AS
	OUTPUT
	FROM (
		SELECT suspects.suspect_id AS s_id,
			suspects.NAME AS s_name,
			suspects.firstname AS s_firstname,
			suspects.lastname AS s_lastname,
			suspects.matchkey AS s_matchkey,
			suspects.middlename AS s_middlename,
			suspects.recordid AS s_recordid,
			candidates.candidate_id AS c_id,
			candidates.NAME AS c_name,
			candidates.firstname AS c_firstname,
			candidates.lastname AS c_lastname,
			candidates.matchkey AS c_matchkey,
			candidates.middlename AS c_middlename,
			candidates.recordid AS c_recordid
		FROM 
		
		   (
			SELECT rowid(*) AS suspect_id
				,*
			FROM namedataintersuspect
			) AS suspects LEFT JOIN
			(
				SELECT rowid(*) AS candidate_id
				,*
				FROM namedataintercandidate
			) AS candidates
			on suspects.matchkey = candidates.matchkey 	
		
		) AS joinrecords
	GROUP BY joinrecords.s_matchkey
	) AS innerResult LATERAL VIEW explode(innerResult.OUTPUT) lateralview AS record;



-- Query to dump data to a file

INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/intermatch/output' 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY ',' 
collection items terminated by '||' map keys terminated by ':'
SELECT lateralview.record ["MatchRecordType"],
	lateralview.record ["MatchScore"],
	lateralview.record ["HasDuplicate"],
	lateralview.record ["CollectionNumber"],
	coalesce(lateralview.record ["ExpressMatched"], ''),
	lateralview.record ["SourceType"],
	lateralview.record ["name"],
	lateralview.record ["firstname"],
	lateralview.record ["lastname"],
	lateralview.record ["matchkey"],
	lateralview.record ["middlename"],
	lateralview.record ["recordid"]
FROM (
	SELECT interMatch(s_id, s_name, s_firstname, s_lastname, s_matchkey, s_middlename, s_recordid, c_id,c_name, c_firstname, c_lastname, c_matchkey, c_middlename, c_recordid) AS
	OUTPUT
	FROM (
		SELECT suspects.suspect_id AS s_id,
			suspects.NAME AS s_name,
			suspects.firstname AS s_firstname,
			suspects.lastname AS s_lastname,
			suspects.matchkey AS s_matchkey,
			suspects.middlename AS s_middlename,
			suspects.recordid AS s_recordid,
			candidates.candidate_id AS c_id,
			candidates.NAME AS c_name,
			candidates.firstname AS c_firstname,
			candidates.lastname AS c_lastname,
			candidates.matchkey AS c_matchkey,
			candidates.middlename AS c_middlename,
			candidates.recordid AS c_recordid
		FROM 
		
		   (
			SELECT rowid(*) AS suspect_id
				,*
			FROM namedataintersuspect
			) AS suspects LEFT JOIN
			(
				SELECT rowid(*) AS candidate_id
				,*
				FROM namedataintercandidate
			) AS candidates
			on suspects.matchkey = candidates.matchkey 	
		
		) AS joinrecords
	GROUP BY joinrecords.s_matchkey
	) AS innerResult LATERAL VIEW explode(innerResult.OUTPUT) lateralview AS record;


-- Sample input Suspect data

--+------------------+----------+-----------------+-------------+------------+-----------+
--| name             | firstname| lastname        | matchkey    | middlename | recordid  |
--+------------------+----------+-----------------+-------------+------------+-----------+
--| LAURA  ABADSANTOS| LAURA    | ABADSANTOS      | L           |            | 1         | 
--+------------------+----------+-----------------+-------------+------------+-----------+

-- Sample input candidate data

--+------------------+----------+-----------------+-------------+------------+-----------+
--| name             | firstname| lastname        | matchkey    | middlename | recordid  |
--+------------------+----------+-----------------+-------------+------------+-----------+
--| KATHRYN E ABATE  | KATHRYN  | ABATE           | L           |  E         | 3         | 
--| ANNA  ABAYEV     | ANNA     | ABAYEV          | L           |            | 5         | 
--+------------------+----------+-----------------+-------------+------------+-----------+

-- Sample output data

--+---------------+----------+------------+----------------+--------------+----------+------------+----------+---------+--------+----------+-----------+
--|MatchRecordType|MatchScore|HasDuplicate|CollectionNumber|ExpressMatched|SourceType| name       | firstname| lastname|matchkey|middlename| recordid  |
--+---------------+----------+------------+----------------+--------------+----------+------------+----------+---------+--------+----------+-----------+
--|S              |0         |Y           |0-0-1           |              |S         |LAURA ABADSA| LAURA    |ABADSANTO|    L   |          | 1         |
--|D              |80        |D           |0-0-1           |N             |C         |KATHRYN E AB| KATHRYN  |AB       |    L   |    E     | 3         |
--|D              |90        |D           |0-0-1           |N             |C         |ANNA ABAYEV | ANNA     |ABAYEV   |    L   |          | 5         |
--+---------------+----------+------------+----------------+--------------+----------+------------+----------+---------+--------+----------+-----------+