作者:他像强盗霸占了d我的心 | 来源:互联网 | 2024-11-05 15:08
本文详细探讨了HBase1.2.6版本中JavaAPI的高级应用,重点介绍了过滤器的使用方法和实际案例。首先,文章对几种常见的HBase过滤器进行了概述,包括列前缀过滤器(ColumnPrefixFilter)和时间戳过滤器(TimestampsFilter)。此外,还详细讲解了分页过滤器(PageFilter)的实现原理及其在大数据查询中的应用场景。通过具体的代码示例,读者可以更好地理解和掌握这些过滤器的使用技巧,从而提高数据处理的效率和灵活性。
HBase版本:1.2.6
1. HBase过滤器简介
(1) 过滤器简介
过滤器 | 解释 |
---|---|
ColumnPrefixFilter | 列前缀过滤器 |
TimestampsFilter | 时间戳过滤器 |
PageFilter | 分页过滤器 |
MultipleColumnPrefixFilter | 复合列前缀过滤器 |
FamilyFilter | 列簇过滤器 |
ColumnPaginationFilter |
SingleColumnValueFilter | 单列值过滤器 |
RowFilter | 行键过滤器 |
QualifierFilter | 列过滤器 |
ColumnRangeFilter |
ValueFilter | 值过滤器 |
PrefixFilter | 前缀过滤器 |
SingleColumnValueExcludeFilter | 单列值排除器 |
ColumnCountGetFilter |
InclusiveStopFilter |
DependentColumnFilter |
FirstKeyOnlyFilter |
KeyOnlyFilter |
(2) 过滤器分类
类别 | 过滤器 |
---|---|
比较过滤器 | RowFilter、FamilyFilter、QualifierFilter、ValueFilter 等 |
专用过滤器 | SingleColumnValueFilter、SingleColumnValueExcludeFilter、PrefixFilter、ColumnPrefixFilter、PageFilter 等 |
(3) 运算符种类
运算符 | 说明 |
---|---|
LESS | < |
LESS_OR_EQUAL | <= |
EQUAL | = |
NOT_EQUAL | != |
GREATER_OR_EQUAL | >= |
GREATER | > |
NO_OP | 没有运算符 |
(4) 比较器种类
比较器 | 说明 |
---|---|
BinaryComparator | 按字节索引顺序比较指定字节数组,采用 Bytes.compareTo(byte[]) |
BinaryPrefixComparator | 跟前面相同,只是比较左端的数据是否相同 |
BitComparator | 按位比较 |
LongComparator | 比较long型value |
NullComparator | 判断给定的 value 是否为空 |
RegexStringComparator | 提供一个正则的比较器,仅支持 EQUAL 和 NOT_EQUAL 运算符 |
SubstringComparator | 判断提供的子串是否出现在 value 中 |
(5) 使用比较过滤器的方法
2. 常见过滤器API
package com.aura.hbase.test;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.MultipleColumnPrefixFilter;
import org.apache.hadoop.hbase.filter.PageFilter;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.apache.hadoop.hbase.filter.QualifierFilter;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.filter.SingleColumnValueExcludeFilter;
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
import org.apache.hadoop.hbase.filter.SubstringComparator;
import org.apache.hadoop.hbase.filter.ValueFilter;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.ColumnPrefixFilter;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.filter.FamilyFilter;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.Test;
import com.aura.hbase.utils.HBasePrintUtil;
public class HBaseFilterTest { public static final String ZOOKEEPER_LIST = "node01:2181,node02:2181,node03:2181";
public static final String TABLE_NAME = "user_info";
public static Configuration cOnf= null;
public static Admin admin = null;
public static Table table = null; static {
cOnf= HBaseConfiguration.create();
conf.set("hbase.zookeeper.quorum", ZOOKEEPER_LIST);
try {
Connection cOnn= ConnectionFactory.createConnection(conf);
admin = conn.getAdmin();
table = conn.getTable(TableName.valueOf(TABLE_NAME));
} catch (IOException e) {
e.printStackTrace();
}
} /*
* 扫描全表,查寻指定列族的记录
*/
@Test
public void testScanWithFamily() throws Exception {
Scan scan = new Scan();
scan.addFamily("base_info".getBytes());
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
} /*
* 扫描全表,查寻指定列族、指定列的记录
*/
@Test
public void testScanWithColumn() throws Exception {
Scan scan = new Scan();
scan.addColumn("base_info".getBytes(), "name".getBytes());
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
} /*
* 扫描全表,查寻指定时间戳或指定时间戳范围的记录
*/
@Test
public void testScanWithTimestamp() throws Exception {
Scan scan = new Scan();
// 指定时间戳,查出一条
// scan.setTimeStamp(1514443301587L);
// 指定时间戳范围,查出一条或多条
scan.setTimeRange(1514443301340L, 1514443301587L);
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
} /*
* 扫描全表,查寻指定rowkey或rowkey范围的记录
*/
@Test
public void testScanWithRowkey() throws Exception {
Scan scan = new Scan();
scan.setStartRow("baiyc_20150716_0003".getBytes());
scan.setStopRow("baiyc_20150716_0005".getBytes());
/*
* 如果只设置了 startRow,就查询从startRow到表末尾的记录(不包括表最后的rowkey所在的那一行记录)
* 如果只设置了 stopRow,就查询从表开头到stopRow的记录(不包括stopRow的那一行记录)
*/
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
} /*
* 测试RowFilter
* 扫描全表,查寻rowkey小于等于"baiyc_20150716_0003"的记录
*/
@Test
public void testRowFilter() throws Exception {
Scan scan = new Scan();
Filter filter = new RowFilter(CompareOp.LESS_OR_EQUAL, new BinaryComparator(Bytes.toBytes("baiyc_20150716_0003")));
scan.setFilter(filter);
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
} /*
* 测试FamilyFilter
* 扫描全表,查寻列簇大于"base_info"的记录
*/
@Test
public void testFamilyFilter() throws Exception {
Scan scan = new Scan();
Filter filter = new FamilyFilter(CompareOp.GREATER, new BinaryComparator(Bytes.toBytes("base_info")));
scan.setFilter(filter);
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
} /*
* 测试QualifierFilter
* 扫描全表,查寻列名等于"name"的记录
*/
@Test
public void testQualifierFilter() throws Exception {
Scan scan = new Scan();
Filter filter = new QualifierFilter(CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes("name")));
/*
* BinaryComparator比较器:匹配完全等值的列名
* Filter filter = new QualifierFilter(CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes("name")));
*
* BinaryPrefixComparator比较器:匹配列名的前缀为"na"的记录
* Filter filter = new QualifierFilter(CompareOp.EQUAL, new BinaryPrefixComparator(Bytes.toBytes("na")));
*
* RegexStringComparator比较器:匹配列名满足正则表达式"na."的记录
* Filter filter = new QualifierFilter(CompareOp.EQUAL, new RegexStringComparator("na."));
*/
scan.setFilter(filter);
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
} /*
* 测试ValueFilter
* 扫描全表,查寻列的值中包含"mus"子串的记录
*/
@Test
public void testValueFilter() throws Exception {
Scan scan = new Scan();
Filter filter = new ValueFilter(CompareOp.EQUAL, new SubstringComparator("mus"));
scan.setFilter(filter);
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
} /*
* 测试FilterList
* 同时添加多个过滤器
*/
@Test
public void testFilterList() throws Exception {
Scan scan = new Scan();
Filter filter1 = new FamilyFilter(CompareOp.GREATER, new BinaryComparator(Bytes.toBytes("base_info")));
Filter filter2 = new ValueFilter(CompareOp.NOT_EQUAL, new BinaryComparator(Bytes.toBytes("music")));
FilterList list = new FilterList(filter1, filter2);
scan.setFilter(list);
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
} /*
* 测试PageFilter
* 分页过滤器,从指定rowkey开始,显示指定的条数
*/
@Test
public void testPageFilter() throws Exception {
Scan scan = new Scan();
// 设置每页显示4页
Filter filter = new PageFilter(4);
// 设置起始的rowkey
scan.setStartRow("baiyc_20150716_0003".getBytes());
scan.setFilter(filter);
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
} /*
* 测试SingleColumnValueFilter:单列值过滤器,会返回满足条件的整行
* 扫描全表,查询列族为"base_info",列名为"name",且列值包括"zhangsan"子串的所有行
*/
@Test
public void testSingleColumnValueFilter() throws Exception {
Scan scan = new Scan();
SingleColumnValueFilter filter = new SingleColumnValueFilter(
Bytes.toBytes("base_info"),
Bytes.toBytes("name"),
CompareOp.EQUAL,
new SubstringComparator("zhangsan"));
/*
* 如果不设置为 true,则那些不包含指定column的行也会返回
* 比如,现在有一行它没有"name"这个列,它的所有的列值中也不包括"shangsan"这个子串,那么这一行也会返回
* 设置为true,只会返回那些有"name"这个列,并且满足过滤条件的行
*/
filter.setFilterIfMissing(true);
scan.setFilter(filter);
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
} /*
* 测试SingleColumnValueExcludeFilter:单列值排除器,返回排除了该列的结果
* 与上面的过滤器查询出来的行相同,但不打印"name"那一列
*/
@Test
public void testSingleColumnValueExcludeFilter() throws Exception {
Scan scan = new Scan();
SingleColumnValueExcludeFilter filter = new SingleColumnValueExcludeFilter(
Bytes.toBytes("base_info"),
Bytes.toBytes("name"),
CompareOp.EQUAL,
new SubstringComparator("zhangsan"));
filter.setFilterIfMissing(true);
scan.setFilter(filter);
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
} /*
* 测试PrefixFilter:前缀过滤器,针对行键
* 扫描全表:查询rowkey的前缀为"baiyc"的全部行
*/
@Test
public void testPrefixFilter() throws Exception {
Scan scan = new Scan();
Filter filter = new PrefixFilter(Bytes.toBytes("baiyc"));
scan.setFilter(filter);
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
} /*
* 测试ColumnPrefixFilter:列前缀过滤器
* 扫描全表:查询列名的前缀为"na"的全部记录
*/
@Test
public void testColumnPrefixFilter() throws Exception {
Scan scan = new Scan();
Filter filter = new ColumnPrefixFilter(Bytes.toBytes("na"));
scan.setFilter(filter);
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
} /*
* 测试MultipleColumnPrefixFilter:基于列名设置多个前缀过滤数据
* 扫描全表:查询列名的前缀为"na"和列名的前缀为"ag"的全部记录
*/
@Test
public void testMultipleColumnPrefixFilter() throws Exception {
Scan scan = new Scan();
byte[][] prefixes = new byte[][] {Bytes.toBytes("na"), Bytes.toBytes("ag")};
Filter filter = new MultipleColumnPrefixFilter(prefixes);
scan.setFilter(filter);
ResultScanner scanner = table.getScanner(scan);
HBasePrintUtil.printResultScanner(scanner);
}
}