Flink DataSet Transformations, Part 2 (IX)
01
Transform: Reduce
Combines the elements of a data set into a single element by merging them pairwise; it can be applied to an entire data set.
■ Merging data
val dataSet = env.fromElements(1,2,3,4)
val results = dataSet.reduce((x,y) => (x+y))
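For this input, results is a data set holding the single value 10, the pairwise sum of 1 through 4.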
//Java
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
public class ReduceJavaDemo {
public static void main(String[] args) throws Exception{
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet<String> source = env.fromElements(
"spark hbase java",
"java spark hive",
"java hbase hbase"
);
// Equivalent version using anonymous inner classes:
// DataSet<Tuple2<String,Integer>> result = source
// .flatMap(new FlatMapFunction<String, Tuple2<String,Integer>>() {
// @Override
// public void flatMap(String line, Collector<Tuple2<String,Integer>> collector) {
// for(String word : line.toUpperCase().split(" ")){
// collector.collect(new Tuple2<>(word, 1));
// }
// }
// })
// .groupBy("f0")
// .reduce(new ReduceFunction<Tuple2<String,Integer>>() {
// @Override
// public Tuple2<String,Integer> reduce(Tuple2<String,Integer> t1, Tuple2<String,Integer> t2) {
// return new Tuple2<>(t1.f0, t1.f1 + t2.f1);
// }
// });
DataSet<Tuple2<String,Integer>> result = source.flatMap((String line , Collector<Tuple2<String,Integer>> collector) -> {
for(String word : line.toUpperCase().split(" ")){
collector.collect(new Tuple2<>(word,1));
}
}).returns(Types.TUPLE(Types.STRING,Types.INT))
.groupBy("f0")
.reduce((x , y) -> new Tuple2<>(x.f0,x.f1+y.f1));
result.print();
}
}
//Scala
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
import org.apache.flink.util.Collector
object ReduceScalaDemo {
def main(args: Array[String]): Unit = {
val env = ExecutionEnvironment.getExecutionEnvironment
val source = env.fromElements(
"spark hbase java",
"java spark hive",
"java hbase hbase"
)
val result = source
.flatMap((line: String, collector: Collector[(String, Int)]) => {
line.toUpperCase.split(" ").foreach(word => collector.collect((word, 1)))
})
.groupBy("_1")
.reduce((x,y) => (x._1,x._2+y._2))
.print()
}
}
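Both the Java and the Scala version print the per-word counts, here (JAVA,3), (HBASE,3), (SPARK,2) and (HIVE,1), in no guaranteed order.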
02
Transform: ReduceGroup
Combines a group of elements into one or more elements; it can be applied to an entire data set. This is a slight optimization over a plain reduce.
■ Optimized merging
val dataSet = env.fromElements(1,2,3,4)
val results = dataSet.reduceGroup(in => in.reduce((x, y) => x + y))
//Java
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.GroupReduceFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
public class ReduceGroupJavaDemo {
public static void main(String[] args) throws Exception{
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet<String> source = env.fromElements(
"spark hbase java",
"java spark hive",
"java hbase hbase"
);
// Equivalent version using anonymous inner classes:
// DataSet<Tuple2<String,Integer>> result = source
// .flatMap(new FlatMapFunction<String, Tuple2<String,Integer>>() {
// @Override
// public void flatMap(String line, Collector<Tuple2<String,Integer>> collector) {
// for(String word : line.toUpperCase().split(" ")){
// collector.collect(new Tuple2<>(word,1));
// }
// }
// })
// .groupBy("f0")
// .reduceGroup(new GroupReduceFunction<Tuple2<String,Integer>, Tuple2<String,Integer>>() {
// @Override
// public void reduce(Iterable<Tuple2<String,Integer>> iterable, Collector<Tuple2<String,Integer>> collector) {
// String word = null;
// int count = 0;
// for(Tuple2<String,Integer> tuple : iterable){
// word = tuple.f0;
// count += tuple.f1;
// }
// collector.collect(new Tuple2<>(word,count));
// }
// });
DataSet<Tuple2<String,Integer>> result = source
.flatMap((String line, Collector<Tuple2<String,Integer>> collector) -> {
for(String word : line.toUpperCase().split(" ")){
collector.collect(new Tuple2<>(word,1));
}
}).returns(Types.TUPLE(Types.STRING,Types.INT))
.groupBy("f0")
.reduceGroup((Iterable<Tuple2<String,Integer>> iterable, Collector<Tuple2<String,Integer>> collector) -> {
String word = null;
int count = 0;
for(Tuple2<String,Integer> tuple : iterable){
word = tuple.f0;
count += tuple.f1;
}
collector.collect(new Tuple2<>(word,count));
}).returns(Types.TUPLE(Types.STRING,Types.INT));
result.print();
}
}
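The original post shows only a Java version for ReduceGroup. For symmetry with the other sections, here is a minimal Scala sketch of the same word count (the object name ReduceGroupScalaDemo is our own; input data and imports mirror the Reduce demo above):
//Scala
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
import org.apache.flink.util.Collector
object ReduceGroupScalaDemo {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    val source = env.fromElements(
      "spark hbase java",
      "java spark hive",
      "java hbase hbase"
    )
    source
      .flatMap((line: String, collector: Collector[(String, Int)]) => {
        line.toUpperCase.split(" ").foreach(word => collector.collect((word, 1)))
      })
      .groupBy("_1")
      // reduceGroup receives the whole group at once and emits one summed tuple per word
      .reduceGroup((iterator: Iterator[(String, Int)], collector: Collector[(String, Int)]) => {
        collector.collect(iterator.reduce((x, y) => (x._1, x._2 + y._2)))
      })
      .print()
  }
}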
03
Transform: CombineGroup
CombineGroup lets us pre-aggregate on each machine first; ReduceGroup then merges the CombineGroup output from every machine.
That way ReduceGroup has far less data to aggregate, which speeds up the computation.
■ Optimized merging
.groupBy("_1")
.combineGroup((words , out : Collector[(String , Int)]) =>{
out.collect(words reduce((x,y)=> (х. _1,x. _2+y. 2))
})
.groupBy("_1")
.reduceGroup(x => x.reduce((x,y)=> (x.1,x._2+y._2))
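Note that combineGroup on its own only produces partial, per-partition results; the second groupBy plus reduceGroup is still required to merge those partial counts into the final totals.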
//Java
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.GroupCombineFunction;
import org.apache.flink.api.common.functions.GroupReduceFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
public class CombineGroupJavaDemo {
public static void main(String[] args) throws Exception{
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet<String> source = env.fromElements(
"spark hbase java",
"java spark hive",
"java hbase hbase"
);
// Equivalent version using anonymous inner classes:
// DataSet<Tuple2<String,Integer>> result = source
// .flatMap(new FlatMapFunction<String, Tuple2<String,Integer>>() {
// @Override
// public void flatMap(String line, Collector<Tuple2<String,Integer>> collector) {
// for(String word : line.toUpperCase().split(" ")){
// collector.collect(new Tuple2<>(word,1));
// }
// }
// })
//
// // pre-aggregate locally on each machine
// .groupBy("f0")
// .combineGroup(new GroupCombineFunction<Tuple2<String,Integer>, Tuple2<String,Integer>>() {
// @Override
// public void combine(Iterable<Tuple2<String,Integer>> iterable, Collector<Tuple2<String,Integer>> collector) {
// String word = null;
// int count = 0;
// for(Tuple2<String,Integer> tuple : iterable){
// word = tuple.f0;
// count += tuple.f1;
// }
// collector.collect(new Tuple2<>(word,count));
// }
// })
// .groupBy("f0")
// .reduceGroup(new GroupReduceFunction<Tuple2<String,Integer>, Tuple2<String,Integer>>() {
// @Override
// public void reduce(Iterable<Tuple2<String,Integer>> iterable, Collector<Tuple2<String,Integer>> collector) {
// String word = null;
// int count = 0;
// for(Tuple2<String,Integer> tuple : iterable){
// word = tuple.f0;
// count += tuple.f1;
// }
// collector.collect(new Tuple2<>(word,count));
// }
// });
DataSet<Tuple2<String,Integer>> result = source
.flatMap((String line, Collector<Tuple2<String,Integer>> collector) -> {
for(String word : line.toUpperCase().split(" ")){
collector.collect(new Tuple2<>(word,1));
}
}).returns(Types.TUPLE(Types.STRING,Types.INT))
.groupBy("f0")
.combineGroup((Iterable<Tuple2<String,Integer>> iterable, Collector<Tuple2<String,Integer>> collector) -> {
String word = null;
int count = 0;
for(Tuple2<String,Integer> tuple : iterable){
word = tuple.f0;
count += tuple.f1;
}
collector.collect(new Tuple2<>(word,count));
}).returns(Types.TUPLE(Types.STRING,Types.INT))
.groupBy("f0")
.reduceGroup((Iterable<Tuple2<String,Integer>> iterable, Collector<Tuple2<String,Integer>> collector) -> {
String word = null;
int count = 0;
for(Tuple2<String,Integer> tuple : iterable){
word = tuple.f0;
count += tuple.f1;
}
collector.collect(new Tuple2<>(word,count));
}).returns(Types.TUPLE(Types.STRING,Types.INT));
result.print();
}
}
//Scala
import org.apache.flink.api.scala.{ExecutionEnvironment,_}
import org.apache.flink.util.Collector
object CombineGroupScalaDemo {
def main(args: Array[String]): Unit = {
val env = ExecutionEnvironment.getExecutionEnvironment
val source = env.fromElements(
"spark hbase java",
"java spark hive",
"java hbase hbase"
)
val result = source
.flatMap((line : String , collector : Collector[(String,Int)]) => {
line.toUpperCase.split(" ").foreach(word => collector.collect((word, 1)))
})
.groupBy("_1")
.combineGroup((iterator, combine_collector: Collector[(String, Int)]) => {
combine_collector.collect(iterator.reduce((x, y) => (x._1, x._2 + y._2)))
})
.groupBy("_1")
.reduceGroup((iterator, collector: Collector[(String, Int)]) => {
collector.collect(iterator.reduce((x, y) => (x._1, x._2 + y._2)))
}).print()
}
}
04
Transform: Aggregate
Combines a group of element values into a single value with an Aggregate Function; it can be applied to an entire DataSet.
■ Taking the maximum of the aggregated values
.group("_1").aggregate(Aggregations.SUM,1)
.group("_1").aggregate(Aggregations.SUM,1).max(1)
.group("_1").aggregate(Aggregations.SUM,1).maxby(1)
//Java
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.aggregation.Aggregations;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
public class AggregateJavaDemo {
public static void main(String[] args) throws Exception{
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet<String> source = env.fromElements(
"spark hbase java",
"java spark hive",
"java hbase hbase"
);
DataSet<Tuple2<String, Integer>> result = source
.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
@Override
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) {
for(String word : s.toUpperCase().split(" ")){
collector.collect(new Tuple2<>(word,1));
}
}
})
.groupBy("f0")
.aggregate(Aggregations.SUM,1);
result.print();
}
}
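The original gives only a Java version for Aggregate. As a minimal Scala sketch under the same assumptions (same input; the object name AggregateScalaDemo is our own), the equivalent word count is:
//Scala
import org.apache.flink.api.java.aggregation.Aggregations
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
import org.apache.flink.util.Collector
object AggregateScalaDemo {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    val source = env.fromElements(
      "spark hbase java",
      "java spark hive",
      "java hbase hbase"
    )
    source
      .flatMap((line: String, collector: Collector[(String, Int)]) => {
        line.toUpperCase.split(" ").foreach(word => collector.collect((word, 1)))
      })
      .groupBy("_1")
      // sum field 1 (the count) within each word group
      .aggregate(Aggregations.SUM, 1)
      .print()
  }
}
It prints the same per-word counts as the ReduceGroup and CombineGroup versions above.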