求两个字符串列表或者数组之间的交集,差集,并集,补集。
import org.apache.commons.collections.CollectionUtils; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import java.util.Arrays; import java.util.HashSet; import java.util.List; /** * @param * 格式二:求两个字符串列表的交集、补集、差集、并集 * 输出两个字符串中相同的元素个数、元素值、元素值的长度 * 如执行: * supportArr(1,'.','1.2.3.4.5.6','3.4.5.6.7',1),则本udf输出:4 * 传入五个参数: * 参数一:大类型:1,交集;2,补集;3,差集;4,并集 * 参数二:字符串中的分隔符, * 参数三:字符串一:里面是以分隔符连接的字符串 * 参数四:字符串二:里面是以分隔符连接的字符串 * 参数五:返回类型:1,元素个数;2,元素的字符串值;3,元素的字符串的长度 * * //废弃 输出两个字符串中相同的元素个数,如执行: * //废弃 supportArr(',','1,2,3,4,5,6','3,4,5,6,7'),则本udf输出:4 * //废弃 传入三个参数: * //废弃 参数一:字符串中的分隔符, * //废弃 参数二:字符串一:里面是以分隔符连接的字符串 * //废弃 参数三:字符串二:里面是以分隔符连接的字符串 * */ public class UDFSupportArr extends UDF{ String[] str1 = null; HashSet<String> hset1 = null; StringBuilder sb = new StringBuilder(); List<String> l1; List<String> l2; List<String> l3; Text result = new Text(); // 废弃 // public Text evaluate(Text sep, Text arr1, Text arr2) // { // if (arr1 == null || arr2 == null) // { // return null; // } // // str1 = (arr1.toString() + sep.toString() + arr2.toString()).split(sep.toString()); // // hset1 = new HashSet<String>(Arrays.asList(str1)); // // result.set(String.valueOf(str1.length - hset1.size())); // // return result; // } public Text evaluate(IntWritable btype, Text tsep, Text arr1, Text arr2, IntWritable type) { if (arr1 == null || arr2 == null) { return null; } if (btype.get() == 1) { // 交集 return intersection(tsep.toString(), arr1.toString(), arr2.toString(), type.get()); } else if (btype.get() == 2) { // 补集 return disjunction(tsep.toString(), arr1.toString(), arr2.toString(), type.get()); } else if (btype.get() == 3) { // 差集 return subtract(tsep.toString(), arr1.toString(), arr2.toString(), type.get()); } else if (btype.get() == 4) { // 并集:即union(去重),不是union all(不去重) return union(tsep.toString(), arr1.toString(), arr2.toString(), type.get()); } return null; } // 交集 @SuppressWarnings("unchecked") public Text intersection(String tsep, String arr1, String arr2, int type) { l1 = Arrays.asList(arr1.split(tsep)); l2 = Arrays.asList(arr2.split(tsep)); l3 = (List<String>) CollectionUtils.intersection(l1, l2); // 返回交集的个数 if (type == 1) { sb.setLength(0); sb.append(l3.size()); } // 返回交集的字符串 else if (type == 2) { sb.setLength(0); for (String str : l3) { sb.append(str + tsep); } if (sb.length() != 0) sb.deleteCharAt(sb.length() - 1); } // 返回交集的字符串的长度 else if (type == 3) { sb.setLength(0); for (String str : l3) { sb.append(str.length() + tsep); } if (sb.length() != 0) sb.delete(sb.length() - tsep.length(), sb.length()); } else { return null; } result.set(sb.toString()); sb.setLength(0); return result; } // 补集 @SuppressWarnings("unchecked") public Text disjunction(String tsep, String arr1, String arr2, int type) { l1 = Arrays.asList(arr1.split(tsep)); l2 = Arrays.asList(arr2.split(tsep)); l3 = (List<String>) CollectionUtils.disjunction(l1, l2); // 返回补集的个数 if (type == 1) { sb.setLength(0); sb.append(l3.size()); } // 返回补集的字符串 else if (type == 2) { sb.setLength(0); for (String str : l3) { if (str.length() > 0) sb.append(str + tsep); } sb.deleteCharAt(sb.length() - 1); } // 返回补集的字符串的长度 else if (type == 3) { sb.setLength(0); for (String str : l3) { if (str.length() > 0) sb.append(str.length() + tsep); } sb.delete(sb.length() - tsep.length(), sb.length()); } else { return null; } result.set(sb.toString()); sb.setLength(0); return result; } // 差集 @SuppressWarnings("unchecked") public Text subtract(String tsep, String arr1, String arr2, int type) { l1 = Arrays.asList(arr1.split(tsep)); l2 = Arrays.asList(arr2.split(tsep)); l3 = (List<String>) CollectionUtils.subtract(l1, l2); // 返回差集的个数 if (type == 1) { sb.setLength(0); sb.append(l3.size()); } // 返回差集的字符串 else if (type == 2) { sb.setLength(0); for (String str : l3) { sb.append(str + tsep); } if (sb.length() > 0) sb.delete(sb.length() - tsep.length(), sb.length()); } // 返回差集的字符串的长度 else if (type == 3) { sb.setLength(0); for (String str : l3) { sb.append(str.length() + tsep); } sb.delete(sb.length() - tsep.length(), sb.length()); } else { return null; } result.set(sb.toString()); sb.setLength(0); return result; } // 并集:即union(去重),不是union all(不去重) @SuppressWarnings("unchecked") public Text union(String tsep, String arr1, String arr2, int type) { l1 = Arrays.asList(arr1.split(tsep)); l2 = Arrays.asList(arr2.split(tsep)); l3 = (List<String>) CollectionUtils.union(l1, l2); // 返回并集的个数 if (type == 1) { sb.setLength(0); sb.append(l3.size()); } // 返回并集的字符串 else if (type == 2) { sb.setLength(0); for (String str : l3) { if (str.length() > 0) sb.append(str + tsep); } sb.deleteCharAt(sb.length() - 1); } // 返回并集的字符串的长度 else if (type == 3) { sb.setLength(0); for (String str : l3) { if (str.length() > 0) sb.append(str.length() + tsep); } sb.delete(sb.length() - tsep.length(), sb.length()); } else { return null; } result.set(sb.toString()); sb.setLength(0); return result; } public static void main(String[] args) { UDFSupportArr a = new UDFSupportArr(); Text sep = new Text(","); Text v1 = new Text("a,b,c,d"); Text v2 = new Text("a,b,e,f"); //废弃 System.out.println("废弃-个数:" + a.evaluate(sep, v1, v2)); System.out.println(); System.out.println("交集-个数:" + a.evaluate(new IntWritable(1), sep, v1, v2, new IntWritable(1))); System.out.println("交集-字符:" + a.evaluate(new IntWritable(1), sep, v1, v2, new IntWritable(2))); System.out.println("交集-长度:" + a.evaluate(new IntWritable(1), sep, v1, v2, new IntWritable(3))); System.out.println(); System.out.println("补集-个数:" + a.evaluate(new IntWritable(2), sep, v1, v2, new IntWritable(1))); System.out.println("补集-字符:" + a.evaluate(new IntWritable(2), sep, v1, v2, new IntWritable(2))); System.out.println("补集-长度:" + a.evaluate(new IntWritable(2), sep, v1, v2, new IntWritable(3))); System.out.println(); System.out.println("差集-个数:" + a.evaluate(new IntWritable(3), sep, v1, v2, new IntWritable(1))); System.out.println("差集-字符:" + a.evaluate(new IntWritable(3), sep, v1, v2, new IntWritable(2))); System.out.println("差集-长度:" + a.evaluate(new IntWritable(3), sep, v1, v2, new IntWritable(3))); System.out.println(); System.out.println("并集-个数:" + a.evaluate(new IntWritable(4), sep, v1, v2, new IntWritable(1))); System.out.println("并集-字符:" + a.evaluate(new IntWritable(4), sep, v1, v2, new IntWritable(2))); System.out.println("并集-长度:" + a.evaluate(new IntWritable(4), sep, v1, v2, new IntWritable(3))); } } 测试数据结果如下: 交集-个数:2 交集-字符:a,b 交集-长度:1,1 补集-个数:4 补集-字符:c,d,e,f 补集-长度:1,1,1,1 差集-个数:2 差集-字符:c,d 差集-长度:1,1 并集-个数:6 并集-字符:a,b,c,d,e,f 并集-长度:1,1,1,1,1,1