
Commit 10a3bf7

Fixing broken feeder mode, update to ES 1.3.1
Feeder mode was broken for two reasons: wrong classpath handling in the example shell scripts, and the move of the river state into the cluster state. The cluster state cannot be used from a TransportClient, so a workaround was introduced so that the JDBC plugin can also start inside a TransportClient. More comments were added to the example shell scripts to clarify how to run feeder mode in a JVM outside of an Elasticsearch node. Updated to Elasticsearch 1.3.1. An NPE when getting the river state should be fixed. Dropped the unused dependency on Hamcrest matchers.
1 parent 130dc51 commit 10a3bf7

18 files changed, +565 -62 lines changed

bin/feeder.in.sh

+26
@@ -0,0 +1,26 @@
+
+# Configuration for the Elasticsearch JDBC feeder mechanism
+
+# Java home
+
+# for Mac OS X
+#JAVA_HOME=$(/usr/libexec/java_home -v 1.7*)
+JAVA_HOME=$(/usr/libexec/java_home -v 1.8*)
+# for Linux
+#JAVA_HOME="/etc/alternatives/java"
+
+# Elasticsearch home
+ES_HOME=~es/elasticsearch-1.3.0
+
+# Elasticsearch plugins folder where the "jdbc" plugin is installed
+ES_PATH_PLUGINS=${ES_HOME}/plugins
+
+# Classpath for loading the JDBC plugin from an external Java execution, without other plugins.
+#
+# The classpath is very similar to the Elasticsearch classpath, but it must follow these rules:
+# - first, the elasticsearch*.jar in the Elasticsearch "lib" folder
+# - then the other jars in the Elasticsearch "lib" folder
+# - the plugins/jdbc folder, for log4j.properties (or log4j2.xml)
+# - the plugins/jdbc jars (the plugin jar and the JDBC driver jars)
+# - nothing else, in particular no other (server-side) plugins
+ES_JDBC_CLASSPATH=${ES_HOME}/lib/elasticsearch\*:${ES_HOME}/lib/\*:${ES_PATH_PLUGINS}/jdbc:${ES_PATH_PLUGINS}/jdbc/\*

bin/feeder/mysql/create.sh

+7-5
@@ -1,8 +1,10 @@
 #!/bin/sh
 
-java="/usr/bin/java"
-#java="/Library/Java/JavaVirtualMachines/jdk1.8.0.jdk/Contents/Home/bin/java"
-#java="/usr/java/jdk1.8.0/bin/java"
+# This example shows two concurrent feeds from a MySQL database (concurrency = 2).
+# It is possible to connect to many databases in parallel and fetch data for Elasticsearch.
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+. ${DIR}/../../feeder.in.sh
 
 echo '
 {
@@ -39,7 +41,7 @@ echo '
 }
 ]
 }
-' | ${java} \
-  -cp $(pwd):$(pwd)/\*:$(pwd)/../../lib/\* \
+' | ${JAVA_HOME}/bin/java \
+  -cp ${ES_JDBC_CLASSPATH} \
   org.xbib.elasticsearch.plugin.feeder.Runner \
   org.xbib.elasticsearch.plugin.feeder.jdbc.JDBCFeeder
File renamed without changes.

bin/feeder/mysql/geo.sh

+39-12
@@ -1,19 +1,24 @@
 #!/bin/sh
 
-# a complete minimalistic geo "push" example for MySQL geo -> Elasticsearch geo search
+# This example shows a complete minimalistic geo push & search example for MySQL -> Elasticsearch
 
+# - install Elasticsearch
+# - run Elasticsearch
 # - install MySQL in /usr/local/mysql
-# - start MySQL on localhost:3306 (default)
-# - prepare a 'test' database in MySQL
-# - create empty user '' with empty password ''
-# - execute SQL in "geo.dump" /usr/local/mysql/bin/mysql test < src/test/resources/geo.dump
-# - then run this script from $ES_HOME/plugins/jdbc: bash bin/feeder/mysql/geo.sh
+# - start MySQL on localhost:3306
+# - as the MySQL root admin, prepare a 'geo' database in MySQL:
+#   CREATE DATABASE geo
+# - as the MySQL root admin, create an empty user '' with empty password '':
+#   GRANT ALL PRIVILEGES ON geo.* TO ''@'localhost' IDENTIFIED BY '';
+# - execute the SQL in geo.dump:
+#   /usr/local/mysql/bin/mysql geo < ./bin/feeder/mysql/geo.dump
+# - then run this script:
+#   ./bin/feeder/mysql/geo.sh
 
-curl -XDELETE 'localhost:9200/myjdbc'
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+. ${DIR}/../../feeder.in.sh
 
-java="/usr/bin/java"
-#java="/Library/Java/JavaVirtualMachines/jdk1.8.0.jdk/Contents/Home/bin/java"
-#java="/usr/java/jdk1.8.0/bin/java"
+curl -XDELETE 'localhost:9200/myjdbc'
 
 echo '
 {
@@ -46,8 +51,8 @@ echo '
 }
 }
 }
-' | ${java} \
-  -cp $(pwd):$(pwd)/\*:$(pwd)/../../lib/\* \
+' | ${JAVA_HOME}/bin/java \
+  -cp ${ES_JDBC_CLASSPATH} \
   org.xbib.elasticsearch.plugin.feeder.Runner \
   org.xbib.elasticsearch.plugin.feeder.jdbc.JDBCFeeder
 
@@ -73,3 +78,25 @@ curl -XPOST 'localhost:9200/myjdbc/_search?pretty' -d '
 }
 }
 }'
+
+# Expected result:
+# {"_shards":{"total":2,"successful":1,"failed":0}}{
+#   "took" : 117,
+#   "timed_out" : false,
+#   "_shards" : {
+#     "total" : 1,
+#     "successful" : 1,
+#     "failed" : 0
+#   },
+#   "hits" : {
+#     "total" : 1,
+#     "max_score" : 1.0,
+#     "hits" : [ {
+#       "_index" : "myjdbc",
+#       "_type" : "mytype",
+#       "_id" : "Dom",
+#       "_score" : 1.0,
+#       "_source":{"city":"Köln","zip":"50667","address":"Domkloster 4","location":{"lat":50.9406645,"lon":6.9599115}}
+#     } ]
+#   }
+# }

bin/feeder/oracle/create.sh

+7-5
@@ -1,8 +1,10 @@
 #!/bin/sh
 
-java="/usr/bin/java"
-#java="/Library/Java/JavaVirtualMachines/jdk1.8.0.jdk/Contents/Home/bin/java"
-#java="/usr/java/jdk1.8.0/bin/java"
+# This example is a template for connecting to Oracle in feeder mode.
+# The JDBC URL and SQL must be replaced with working ones.
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+. ${DIR}/../../feeder.in.sh
 
 echo '
 {
@@ -24,7 +26,7 @@ echo '
 }
 }
 }
-' | ${java} \
-  -cp $(pwd):$(pwd)/\*:$(pwd)/../../lib/\* \
+' | ${JAVA_HOME}/bin/java \
+  -cp ${ES_JDBC_CLASSPATH} \
   org.xbib.elasticsearch.plugin.feeder.Runner \
   org.xbib.elasticsearch.plugin.feeder.jdbc.JDBCFeeder

pom.xml

+6-15
@@ -7,12 +7,12 @@
 
     <groupId>org.xbib.elasticsearch.plugin</groupId>
    <artifactId>elasticsearch-river-jdbc</artifactId>
-    <version>1.3.0.1</version>
+    <version>1.3.0.2</version>
 
    <packaging>jar</packaging>
 
    <name>elasticsearch-river-jdbc</name>
-    <description>JDBC River for ElasticSearch</description>
+    <description>JDBC River for Elasticsearch</description>
 
    <url>http://github.com/jprante/elasticsearch-river-jdbc</url>
 
@@ -68,12 +68,11 @@
        </repository>
    </repositories>
 
-
    <properties>
        <github.global.server>github</github.global.server>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-        <java.compile.version>1.7</java.compile.version>
-        <elasticsearch.version>1.3.0</elasticsearch.version>
+        <java.compiler.version>1.7</java.compiler.version>
+        <elasticsearch.version>1.3.1</elasticsearch.version>
    </properties>
 
    <dependencies>
@@ -92,14 +91,6 @@
            <scope>test</scope>
        </dependency>
 
-        <dependency>
-            <groupId>org.hamcrest</groupId>
-            <artifactId>hamcrest-all</artifactId>
-            <version>1.1</version>
-            <type>jar</type>
-            <scope>test</scope>
-        </dependency>
-
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j-impl</artifactId>
@@ -145,8 +136,8 @@
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
-                    <source>${java.compile.version}</source>
-                    <target>${java.compile.version}</target>
+                    <source>${java.compiler.version}</source>
+                    <target>${java.compiler.version}</target>
                    <encoding>UTF-8</encoding>
                    <optimize>true</optimize>
                    <showDeprecation>true</showDeprecation>
src/main/assemblies/plugin.xml

+1
@@ -36,6 +36,7 @@
         <directory>${project.basedir}/src/test/resources</directory>
         <outputDirectory>/</outputDirectory>
         <includes>
+            <include>log4j.properties</include>
             <include>log4j2.xml</include>
         </includes>
     </fileSet>

src/main/java/org/xbib/elasticsearch/action/river/jdbc/state/delete/TransportDeleteRiverStateAction.java

+5-3
@@ -7,21 +7,22 @@
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.ack.ClusterStateUpdateResponse;
 import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.Injector;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.TransportService;
 import org.xbib.elasticsearch.action.river.jdbc.state.RiverStateService;
 
 public class TransportDeleteRiverStateAction extends TransportMasterNodeOperationAction<DeleteRiverStateRequest, DeleteRiverStateResponse> {
 
-    private final RiverStateService riverStateService;
+    private final Injector injector;
 
     @Inject
     public TransportDeleteRiverStateAction(Settings settings, ThreadPool threadPool,
                                            ClusterService clusterService, TransportService transportService,
-                                           RiverStateService riverStateService) {
+                                           Injector injector) {
         super(settings, DeleteRiverStateAction.NAME, transportService, clusterService, threadPool);
-        this.riverStateService = riverStateService;
+        this.injector = injector;
     }
 
     @Override
@@ -41,6 +42,7 @@ protected DeleteRiverStateResponse newResponse() {
 
     @Override
     protected void masterOperation(DeleteRiverStateRequest request, ClusterState state, final ActionListener<DeleteRiverStateResponse> listener) throws ElasticsearchException {
+        RiverStateService riverStateService = injector.getInstance(RiverStateService.class);
         riverStateService.unregisterRiver(new RiverStateService.UnregisterRiverStateRequest("delete_river_state[" + request.getRiverName() + "]", request.getRiverName())
                 .masterNodeTimeout(request.masterNodeTimeout())
                 .ackTimeout(request.ackTimeout()), new ActionListener<ClusterStateUpdateResponse>() {
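The put action below gets the same treatment. As a side note, here is a minimal sketch of the lazy-lookup pattern used in this change, assuming (per the commit message) that RiverStateService is only bound on server-side nodes and therefore cannot be constructor-injected when the actions are instantiated inside a TransportClient; the class name is illustrative, not part of the plugin:

import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.Injector;
import org.xbib.elasticsearch.action.river.jdbc.state.RiverStateService;

// Illustrative sketch: defer the RiverStateService lookup until the operation
// actually runs, so merely constructing the action in a TransportClient
// (where the service is not bound) does not fail.
public class LazyRiverStateLookup {

    private final Injector injector;

    @Inject
    public LazyRiverStateLookup(Injector injector) {
        this.injector = injector; // the Injector itself is always injectable
    }

    public RiverStateService riverStateService() {
        // resolved on demand, i.e. on the master node where the binding exists
        return injector.getInstance(RiverStateService.class);
    }
}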

src/main/java/org/xbib/elasticsearch/action/river/jdbc/state/get/GetRiverStateResponse.java

+3
@@ -46,6 +46,8 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
     @Override
     public void readFrom(StreamInput in) throws IOException {
         super.readFrom(in);
+        getRiverStateRequest = new GetRiverStateRequest();
+        getRiverStateRequest.readFrom(in);
         int len = in.readInt();
         ImmutableList.Builder<RiverState> builder = ImmutableList.builder();
         for (int i = 0; i < len; i++) {
@@ -59,6 +61,7 @@ public void readFrom(StreamInput in) throws IOException {
     @Override
     public void writeTo(StreamOutput out) throws IOException {
         super.writeTo(out);
+        getRiverStateRequest.writeTo(out);
         out.writeInt(states.size());
         for (RiverState rs : states) {
             rs.writeTo(out);
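For context on the NPE mentioned in the commit message: before this change the nested request was never serialized, so after transport the deserialized response carried a null getRiverStateRequest. A small sketch of the symmetric readFrom/writeTo pattern the fix follows, using Elasticsearch's Streamable interface; the class name is illustrative, not part of the plugin:

import java.io.IOException;

import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Streamable;

// Illustrative sketch: a nested Streamable must be written in writeTo and
// re-created plus read in readFrom, otherwise it is null on the receiving side.
public class NestedStreamableSketch implements Streamable {

    private GetRiverStateRequest getRiverStateRequest = new GetRiverStateRequest();

    @Override
    public void readFrom(StreamInput in) throws IOException {
        getRiverStateRequest = new GetRiverStateRequest(); // re-create before reading
        getRiverStateRequest.readFrom(in);
    }

    @Override
    public void writeTo(StreamOutput out) throws IOException {
        getRiverStateRequest.writeTo(out); // must mirror readFrom exactly
    }
}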

src/main/java/org/xbib/elasticsearch/action/river/jdbc/state/put/TransportPutRiverStateAction.java

+5-3
@@ -7,21 +7,22 @@
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.ack.ClusterStateUpdateResponse;
 import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.Injector;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.TransportService;
 import org.xbib.elasticsearch.action.river.jdbc.state.RiverStateService;
 
 public class TransportPutRiverStateAction extends TransportMasterNodeOperationAction<PutRiverStateRequest, PutRiverStateResponse> {
 
-    private final RiverStateService riverStateService;
+    private final Injector injector;
 
     @Inject
     public TransportPutRiverStateAction(Settings settings, ThreadPool threadPool,
                                         ClusterService clusterService, TransportService transportService,
-                                        RiverStateService riverStateService) {
+                                        Injector injector) {
         super(settings, PutRiverStateAction.NAME, transportService, clusterService, threadPool);
-        this.riverStateService = riverStateService;
+        this.injector = injector;
     }
 
     @Override
@@ -41,6 +42,7 @@ protected PutRiverStateResponse newResponse() {
 
     @Override
     protected void masterOperation(PutRiverStateRequest request, ClusterState state, final ActionListener<PutRiverStateResponse> listener) throws ElasticsearchException {
+        RiverStateService riverStateService = injector.getInstance(RiverStateService.class);
         riverStateService.registerRiver(new RiverStateService.RegisterRiverStateRequest("put_river_state[" + request.getRiverName() + "]", request.getRiverName(), request.getRiverType())
                 .riverState(request.getRiverState())
                 .masterNodeTimeout(request.masterNodeTimeout())

src/main/java/org/xbib/elasticsearch/plugin/feeder/AbstractFeeder.java

+17
@@ -14,6 +14,7 @@
 import org.xbib.elasticsearch.action.river.jdbc.state.RiverState;
 import org.xbib.elasticsearch.support.client.Ingest;
 import org.xbib.elasticsearch.support.client.node.NodeClient;
+import org.xbib.io.URIUtil;
 import org.xbib.pipeline.AbstractPipeline;
 import org.xbib.pipeline.Pipeline;
 import org.xbib.pipeline.PipelineException;
@@ -25,6 +26,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
+import java.net.URI;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
@@ -410,4 +412,19 @@ public InputStream getDefaultMapping(String index, String type) {
 
     public abstract void executeTask(Map<String, Object> map) throws Exception;
 
+    public Settings clientSettings(URI connectionSpec) {
+        return settingsBuilder()
+                .put("name", "feeder") // prevents lookup of names.txt (we don't ship it) and marks this node as "feeder"; see also the module-load skipping in JDBCRiverPlugin
+                .put("network.server", false) // this is not a server
+                .put("node.client", true) // this is an Elasticsearch client
+                .put("cluster.name", URIUtil.parseQueryString(connectionSpec).get("es.cluster.name")) // the specified remote ES cluster
+                .put("client.transport.sniff", false) // we do not sniff (should be configurable)
+                .put("client.transport.ignore_cluster_name", false) // respect the cluster name setting
+                .put("client.transport.ping_timeout", "30s") // large ping timeout (should not be required)
+                .put("client.transport.nodes_sampler_interval", "30s") // only relevant for sniff sampling
+                .put("path.plugins", ".dontexist") // pointing to a non-existing folder disables loading of site plugins
+                // we do not need to change classpath settings when using the "feeder" name trick
+                .build();
+    }
+
 }
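A rough sketch of how settings in this spirit are consumed on the Elasticsearch 1.3.x side when a TransportClient is built from them; the BulkTransportClient internals are not part of this diff, and the host, port, and cluster name below are placeholder assumptions:

import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;

public class FeederClientSketch {

    public static void main(String[] args) {
        // settings in the spirit of AbstractFeeder.clientSettings(); values are placeholders
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("name", "feeder")                // lets JDBCRiverPlugin skip the server-side river state module
                .put("node.client", true)
                .put("client.transport.sniff", false)
                .put("client.transport.ignore_cluster_name", false)
                .put("cluster.name", "elasticsearch") // would come from the es.cluster.name URI parameter
                .build();

        TransportClient client = new TransportClient(settings)
                .addTransportAddress(new InetSocketTransportAddress("localhost", 9300));
        try {
            System.out.println("connected nodes: " + client.connectedNodes());
        } finally {
            client.close();
        }
    }
}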

src/main/java/org/xbib/elasticsearch/plugin/feeder/jdbc/JDBCFeeder.java

+5-3
@@ -111,12 +111,14 @@ public Feeder<T, R, P> beforeRun() throws IOException {
                     Runtime.getRuntime().availableProcessors());
             ByteSizeValue maxvolume = settings.getAsBytesSize("maxbulkvolume", ByteSizeValue.parseBytesSizeValue("10m"));
             TimeValue maxrequestwait = settings.getAsTime("maxrequestwait", TimeValue.timeValueSeconds(60));
-            ingest = new BulkTransportClient();
+            BulkTransportClient ingest = new BulkTransportClient();
+            URI connSpec = URI.create(settings.get("elasticsearch"));
             ingest.maxActionsPerBulkRequest(maxbulkactions)
                     .maxConcurrentBulkRequests(maxconcurrentbulkrequests)
                     .maxVolumePerBulkRequest(maxvolume)
-                    .maxRequestWait(maxrequestwait);
-            ingest.newClient(URI.create(settings.get("elasticsearch")));
+                    .maxRequestWait(maxrequestwait)
+                    .newClient(connSpec, clientSettings(connSpec));
+            this.ingest = ingest;
         }
         // create queue
         super.beforeRun();
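The connection spec handed to newClient comes from the feeder's "elasticsearch" setting; judging from clientSettings above, it is expected to carry the cluster name as an es.cluster.name query parameter. A hedged sketch of that parsing with plain java.net.URI; the es:// scheme, host, and port are assumptions, and the plugin itself uses org.xbib.io.URIUtil for this:

import java.net.URI;

public class ConnectionSpecSketch {

    public static void main(String[] args) {
        // assumed URI form; the actual scheme, host, and port depend on the feeder definition
        URI connectionSpec = URI.create("es://localhost:9300?es.cluster.name=elasticsearch");

        String clusterName = null;
        for (String pair : connectionSpec.getQuery().split("&")) {
            String[] kv = pair.split("=", 2);
            if ("es.cluster.name".equals(kv[0])) {
                clusterName = kv.length > 1 ? kv[1] : null;
            }
        }

        System.out.println("host = " + connectionSpec.getHost()); // localhost
        System.out.println("port = " + connectionSpec.getPort()); // 9300
        System.out.println("cluster.name = " + clusterName);      // elasticsearch
    }
}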

src/main/java/org/xbib/elasticsearch/plugin/river/jdbc/JDBCRiverPlugin.java

+9-2
@@ -3,6 +3,7 @@
 import org.elasticsearch.action.ActionModule;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.Module;
+import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.plugins.AbstractPlugin;
 import org.elasticsearch.rest.RestModule;
 import org.elasticsearch.river.RiversModule;
@@ -24,8 +25,11 @@
 
 public class JDBCRiverPlugin extends AbstractPlugin {
 
+    private final Settings settings;
+
     @Inject
-    public JDBCRiverPlugin() {
+    public JDBCRiverPlugin(Settings settings) {
+        this.settings = settings;
     }
 
     @Override
@@ -43,7 +47,10 @@ public String description() {
     @Override
     public Collection<Class<? extends Module>> modules() {
         Collection<Class<? extends Module>> modules = newArrayList();
-        modules.add(RiverStateModule.class);
+        // in "feeder" node mode, skip initializing the server-side-only river state module
+        if (!"feeder".equals(settings.get("name"))) {
+            modules.add(RiverStateModule.class);
+        }
         return modules;
     }
src/main/java/org/xbib/elasticsearch/support/client/bulk/BulkTransportClient.java

-9
@@ -96,15 +96,6 @@ public BulkTransportClient newClient(Client client) {
         return this.newClient(findURI());
     }
 
-    /**
-     * Create a new client
-     *
-     * @return this client
-     */
-    public BulkTransportClient newClient() {
-        return this.newClient(findURI());
-    }
-
     /**
      * Create new client
      * The URI describes host and port of the node the client should connect to,
