I recently used Elasticsearch, an excellent Lucene-based search engine, in a project. To make it easy to get hands-on quickly, here is a summary of common curl-based CRUD operations, plus examples of importing data into ES with Spark:
# Retrieve a document
curl -XGET 'localhost:9200/customer/external/1?pretty'
# Create an index
curl -XPUT 'mbdt1:9200/customer?pretty'
# List indices
curl 'mbdt2:9200/_cat/indices?v'
# Update a document; "doc" holds the partial document content to merge in
curl -XPOST 'localhost:9200/customer/external/1/_update?pretty' -d '
{
"doc": { "name": "Jane Doe" }
}'
# Update via a dynamic script
curl -XPOST 'localhost:9200/customer/external/1/_update?pretty' -d '
{
"script": "ctx._source.age += 5"
}'
# "scripts of type [inline], operation [update] and lang [groovy] are disabled"
# If you hit the error above, Groovy dynamic scripting is disabled by default and must be enabled in Elasticsearch:
### Enable fully
# Edit `config/elasticsearch.yml` and append the following:
script.inline: on
script.indexed: on
script.file: on
### Enable in sandbox only
# Edit `config/elasticsearch.yml` and append the following:
script.inline: sandbox
script.indexed: sandbox
script.file: on
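# These settings only take effect after a restart; a minimal sketch, assuming ES was installed as a system service (adjust to however you run it):
sudo service elasticsearch restart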
# Delete a document
curl -XDELETE 'localhost:9200/customer/external/2?pretty'
## Bulk operations
# Create
curl -XPOST 'localhost:9200/customer/external/_bulk?pretty' -d '
{"index":{"_id":"1"}}
{"name": "John Doe" }
{"index":{"_id":"2"}}
{"name": "Jane Doe" }
'
# Create with the "create" action
curl -XPOST 'mbdt1:9200/_bulk?pretty' -d '
{ "create": { "_index": "index1", "_type": "resource", "_id": 13 } }
{ "title": "....." }
'
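# Note: unlike "index", "create" fails with a conflict if a document with that _id already exists.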
# Mixed operations in a single bulk request
curl -XPOST 'localhost:9200/customer/external/_bulk?pretty' -d '
{"update":{"_id":"1"}}
{"doc": { "name": "John Doe becomes Jane Doe" } }
{"delete":{"_id":"2"}}
'
# Bulk import from a local file
curl -XPOST 'localhost:9200/bank/account/_bulk?pretty' --data-binary "@accounts.json"
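# accounts.json must itself be in the bulk NDJSON format (an action line, then a source line, ending with a final newline). A minimal sketch; the field values here are illustrative:
{"index":{"_id":"1"}}
{"account_number":1,"balance":39225}
{"index":{"_id":"2"}}
{"account_number":2,"balance":28838}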
curl 'localhost:9200/_cat/indices?v'
# Search: match all documents
curl -XPOST 'localhost:9200/bank/_search?pretty' -d '
{
"query": { "match_all": {} }
}'
# Sorting and pagination
curl -XPOST 'mbdt1:9200/bank/_search?pretty' -d '
{
"query": { "match_all": {} },
"_source": ["account_number", "balance"],
"sort": {"balance": {"order":"desc"}},
"from": 10,
"size": 3
}'
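# "from" is the zero-based offset into the ranked hits, so this skips the top 10 and returns hits 11-13 by descending balance, with only account_number and balance in each hit.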
# Field match
curl -XGET 'mbdt2:9200/z_wo_order/record/_search?pretty' -d '{
"query": {
"match": {
"cust_name": {
"query": "黄利欢",
"operator": "and"
}
}
}
}'
# Combined (bool) query
GET /my_index/my_type/_search
{
"query": {
"bool": {
"must": { "match": { "title": "quick" }},
"must_not": { "match": { "title": "lazy" }},
"should": [
{ "match": { "title": "brown" }},
{ "match": { "title": "dog" }}
]
}
}
}
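# A hit must satisfy "must" and must not satisfy "must_not"; the "should" clauses are optional here and only raise the relevance score of documents that match them.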
# Controlling precision
GET /my_index/my_type/_search
{
"query": {
"match": {
"title": {
"query": "quick brown dog",
"minimum_should_match": "75%"
}
}
}
}
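# With a three-term query, "75%" works out to 3 × 0.75 = 2.25, which Elasticsearch rounds down, so at least 2 of quick/brown/dog must match.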
Importing data into ES with Spark:
# Import data using the Spark support in elasticsearch-hadoop
Command:
./spark-shell --jars /home/ouguangneng/elasticsearch-hadoop-2.2.0/dist/elasticsearch-spark_2.10-2.2.0.jar --conf spark.es.nodes=xxx2 --conf spark.es.port=9200 --master yarn-client --num-executors 10 --driver-memory 4g --executor-memory 3g --executor-cores 4
# Import data from an RDD
import org.elasticsearch.spark._
val numbers = Map("one" -> 1, "two" -> 2, "three" -> 3)
val airports = Map("arrival" -> "Otopeni", "SFO" -> "San Fran")
sc.makeRDD(Seq(numbers, airports)).saveToEs("test/ext")
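# To spot-check the result (assuming the ES node passed as spark.es.nodes above):
curl 'xxx2:9200/test/ext/_search?pretty'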
# Import data from an RDD, using the oid field as the document id via es.mapping.id
import org.elasticsearch.spark._
import org.elasticsearch.spark.rdd.EsSpark
case class Trip(oid: String, departure: String, arrival: String)
val upcomingTrip = Trip("1", "OTP", "SFO")
val lastWeekTrip = Trip("2", "MUC", "OTP")
val rdd = sc.makeRDD(Seq(upcomingTrip, lastWeekTrip))
EsSpark.saveToEs(rdd, "test/ext", Map("es.mapping.id" -> "oid"))
# Import data from a Spark SQL DataFrame
import org.elasticsearch.spark.sql._
import org.apache.spark.sql.hive.HiveContext
val hiveContext = new HiveContext(sc)
val df = hiveContext.sql("select * from tmp.z_wo_order limit 50")
df.saveToEs("z_wo_order/record", Map("es.mapping.id" -> "order_id"))
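# As before, a quick search confirms the import (assuming the same ES node):
curl 'xxx2:9200/z_wo_order/record/_search?pretty'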