diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteCollectionCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteCollectionCmd.java index e5f6f2d527..7177f03d1a 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteCollectionCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteCollectionCmd.java @@ -181,7 +181,7 @@ public class DeleteCollectionCmd implements OverseerCollectionMessageHandler.Cmd } } - private String referencedByAlias(String collection, Aliases aliases) { + public static String referencedByAlias(String collection, Aliases aliases) { Objects.requireNonNull(aliases); return aliases.getCollectionAliasListMap().entrySet().stream() .filter(e -> e.getValue().contains(collection)) diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java index de7b3ebb38..a1bd8268b4 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java @@ -241,6 +241,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler, .put(DELETEREPLICA, new DeleteReplicaCmd(this)) .put(ADDREPLICA, new AddReplicaCmd(this)) .put(MOVEREPLICA, new MoveReplicaCmd(this)) + .put(REINDEXCOLLECTION, new ReindexCollectionCmd(this)) .put(UTILIZENODE, new UtilizeNodeCmd(this)) .build() ; diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/ReindexCollectionCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/ReindexCollectionCmd.java new file mode 100644 index 0000000000..553c4bf7ba --- /dev/null +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/ReindexCollectionCmd.java @@ -0,0 +1,824 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.cloud.api.collections; + +import java.lang.invoke.MethodHandles; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.TreeMap; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.http.client.HttpClient; +import org.apache.solr.client.solrj.SolrResponse; +import org.apache.solr.client.solrj.cloud.DistribStateManager; +import org.apache.solr.client.solrj.cloud.autoscaling.Policy; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.client.solrj.io.SolrClientCache; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.QueryRequest; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.cloud.Overseer; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.cloud.Aliases; +import org.apache.solr.common.cloud.ClusterState; +import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.DocRouter; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.ZkNodeProps; +import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.common.params.CollectionAdminParams; +import org.apache.solr.common.params.CollectionParams; +import org.apache.solr.common.params.CommonAdminParams; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.CoreAdminParams; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.Utils; +import org.apache.solr.util.TestInjection; +import org.apache.solr.util.TimeOut; +import org.apache.zookeeper.CreateMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Reindex a collection, usually in order to change the index schema. + *

+ * <p>WARNING: Reindexing is potentially a lossy operation - some indexed data that is not available as
+ * stored fields may be irretrievably lost, so users should use this command with caution, evaluating
+ * the potential impact by using different source and target collection names first, and preserving
+ * the source collection until the evaluation is complete.</p>
+ * <p>Reindexing follows these steps:</p>
+ * <ol>
+ *    <li>creates a temporary collection using the most recent schema of the source collection
+ *    (or the one specified in the parameters, which must already exist), and the shape of the original
+ *    collection, unless overridden by parameters.</li>
+ *    <li>copy the source documents to the temporary collection, using their stored fields and
+ *    reindexing them using the specified schema. NOTE: some data
+ *    loss may occur if the original stored field data is not available!</li>
+ *    <li>create the target collection from scratch with the specified name (or the same as source if not
+ *    specified) and the specified parameters. NOTE: if the target name was not specified or is the same
+ *    as the source collection then a unique sequential collection name will be used.</li>
+ *    <li>copy the documents from the source collection to the target collection.</li>
+ *    <li>if the source and target collection name was the same then set up an alias pointing from the source collection name to the actual
+ *    (sequentially named) target collection</li>
+ *    <li>optionally delete the source collection.</li>
+ * </ol>
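+ * <p>A minimal SolrJ invocation is sketched below; the collection names are illustrative and the
+ * call mirrors the usage exercised in {@code ReindexCollectionTest}:
+ * <pre>{@code
+ * // re-index "sourceCollection" into a new collection named "targetCollection"
+ * CollectionAdminRequest.ReindexCollection req =
+ *     CollectionAdminRequest.reindexCollection("sourceCollection")
+ *         .setTarget("targetCollection");
+ * CollectionAdminResponse rsp = req.process(solrClient);
+ * }</pre>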
+ */ +public class ReindexCollectionCmd implements OverseerCollectionMessageHandler.Cmd { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + public static final String COMMAND = "cmd"; + public static final String REINDEX_STATUS = "reindexStatus"; + public static final String REMOVE_SOURCE = "removeSource"; + public static final String TARGET = "target"; + public static final String TARGET_COL_PREFIX = ".rx_"; + public static final String CHK_COL_PREFIX = ".rx_ck_"; + public static final String REINDEXING_STATE = CollectionAdminRequest.PROPERTY_PREFIX + "rx"; + + public static final String STATE = "state"; + public static final String PHASE = "phase"; + + private static final List COLLECTION_PARAMS = Arrays.asList( + ZkStateReader.CONFIGNAME_PROP, + ZkStateReader.NUM_SHARDS_PROP, + ZkStateReader.NRT_REPLICAS, + ZkStateReader.PULL_REPLICAS, + ZkStateReader.TLOG_REPLICAS, + ZkStateReader.REPLICATION_FACTOR, + ZkStateReader.MAX_SHARDS_PER_NODE, + "shards", + Policy.POLICY, + CollectionAdminParams.CREATE_NODE_SET_PARAM, + CollectionAdminParams.CREATE_NODE_SET_SHUFFLE_PARAM, + ZkStateReader.AUTO_ADD_REPLICAS + ); + + private final OverseerCollectionMessageHandler ocmh; + + private static AtomicInteger tmpCollectionSeq = new AtomicInteger(); + + public enum State { + IDLE, + RUNNING, + ABORTED, + FINISHED; + + public String toLower() { + return toString().toLowerCase(Locale.ROOT); + } + + public static State get(Object p) { + if (p == null) { + return null; + } + p = String.valueOf(p).toLowerCase(Locale.ROOT); + return states.get(p); + } + static Map states = Collections.unmodifiableMap( + Stream.of(State.values()).collect(Collectors.toMap(State::toLower, Function.identity()))); + } + + public enum Cmd { + START, + ABORT, + STATUS; + + public String toLower() { + return toString().toLowerCase(Locale.ROOT); + } + + public static Cmd get(String p) { + if (p == null) { + return null; + } + p = p.toLowerCase(Locale.ROOT); + return cmds.get(p); + } + static Map cmds = Collections.unmodifiableMap( + Stream.of(Cmd.values()).collect(Collectors.toMap(Cmd::toLower, Function.identity()))); + } + + private SolrClientCache solrClientCache; + private String zkHost; + + public ReindexCollectionCmd(OverseerCollectionMessageHandler ocmh) { + this.ocmh = ocmh; + } + + @Override + public void call(ClusterState clusterState, ZkNodeProps message, NamedList results) throws Exception { + + log.debug("*** called: {}", message); + + String collection = message.getStr(CommonParams.NAME); + // before resolving aliases + String originalCollection = collection; + Aliases aliases = ocmh.zkStateReader.getAliases(); + if (collection != null) { + // resolve aliases - the source may be an alias + List aliasList = aliases.resolveAliases(collection); + if (aliasList != null && !aliasList.isEmpty()) { + collection = aliasList.get(0); + } + } + + if (collection == null || !clusterState.hasCollection(collection)) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Source collection name must be specified and must exist"); + } + String target = message.getStr(TARGET); + if (target == null) { + target = collection; + } else { + // resolve aliases + List aliasList = aliases.resolveAliases(target); + if (aliasList != null && !aliasList.isEmpty()) { + target = aliasList.get(0); + } + } + boolean sameTarget = target.equals(collection) || target.equals(originalCollection); + boolean removeSource = message.getBool(REMOVE_SOURCE, false); + Cmd command = 
Cmd.get(message.getStr(COMMAND, Cmd.START.toLower())); + if (command == null) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown command: " + message.getStr(COMMAND)); + } + Map reindexingState = getReindexingState(ocmh.cloudManager.getDistribStateManager(), collection); + if (!reindexingState.containsKey(STATE)) { + reindexingState.put(STATE, State.IDLE.toLower()); + } + State state = State.get(reindexingState.get(STATE)); + if (command == Cmd.ABORT) { + log.info("Abort requested for collection {}, setting the state to ABORTED.", collection); + // check that it's running + if (state != State.RUNNING) { + log.debug("Abort requested for collection {} but command is not running: {}", collection, state); + return; + } + setReindexingState(collection, State.ABORTED, null); + reindexingState.put(STATE, "aborting"); + results.add(REINDEX_STATUS, reindexingState); + // if needed the cleanup will be performed by the running instance of the command + return; + } else if (command == Cmd.STATUS) { + results.add(REINDEX_STATUS, reindexingState); + return; + } + // command == Cmd.START + + // check it's not already running + if (state == State.RUNNING) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Reindex is already running for collection " + collection + + ". If you are sure this is not the case you can issue &cmd=abort to clean up this state."); + } + + DocCollection coll = clusterState.getCollection(collection); + boolean aborted = false; + int batchSize = message.getInt(CommonParams.ROWS, 100); + String query = message.getStr(CommonParams.Q, "*:*"); + String fl = message.getStr(CommonParams.FL, "*"); + Integer rf = message.getInt(ZkStateReader.REPLICATION_FACTOR, coll.getReplicationFactor()); + Integer numNrt = message.getInt(ZkStateReader.NRT_REPLICAS, coll.getNumNrtReplicas()); + Integer numTlog = message.getInt(ZkStateReader.TLOG_REPLICAS, coll.getNumTlogReplicas()); + Integer numPull = message.getInt(ZkStateReader.PULL_REPLICAS, coll.getNumPullReplicas()); + int numShards = message.getInt(ZkStateReader.NUM_SHARDS_PROP, coll.getActiveSlices().size()); + int maxShardsPerNode = message.getInt(ZkStateReader.MAX_SHARDS_PER_NODE, coll.getMaxShardsPerNode()); + DocRouter router = coll.getRouter(); + if (router == null) { + router = DocRouter.DEFAULT; + } + + String configName = message.getStr(ZkStateReader.CONFIGNAME_PROP, ocmh.zkStateReader.readConfigName(collection)); + String targetCollection; + int seq = tmpCollectionSeq.getAndIncrement(); + if (sameTarget) { + do { + targetCollection = TARGET_COL_PREFIX + originalCollection + "_" + seq; + if (!clusterState.hasCollection(targetCollection)) { + break; + } + seq = tmpCollectionSeq.getAndIncrement(); + } while (clusterState.hasCollection(targetCollection)); + } else { + targetCollection = target; + } + String chkCollection = CHK_COL_PREFIX + originalCollection; + String daemonUrl = null; + Exception exc = null; + boolean createdTarget = false; + try { + solrClientCache = new SolrClientCache(ocmh.overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient()); + zkHost = ocmh.zkStateReader.getZkClient().getZkServerAddress(); + // set the running flag + reindexingState.clear(); + reindexingState.put("actualSourceCollection", collection); + reindexingState.put("actualTargetCollection", targetCollection); + reindexingState.put("checkpointCollection", chkCollection); + reindexingState.put("inputDocs", getNumberOfDocs(collection)); + reindexingState.put(PHASE, "creating target and checkpoint 
collections"); + setReindexingState(collection, State.RUNNING, reindexingState); + + // 0. set up target and checkpoint collections + NamedList cmdResults = new NamedList<>(); + ZkNodeProps cmd; + if (clusterState.hasCollection(targetCollection)) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Target collection " + targetCollection + " already exists! Delete it first."); + } + if (clusterState.hasCollection(chkCollection)) { + // delete the checkpoint collection + cmd = new ZkNodeProps( + Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.DELETE.toLower(), + CommonParams.NAME, chkCollection, + CoreAdminParams.DELETE_METRICS_HISTORY, "true" + ); + ocmh.commandMap.get(CollectionParams.CollectionAction.DELETE).call(clusterState, cmd, cmdResults); + checkResults("deleting old checkpoint collection " + chkCollection, cmdResults, true); + } + + if (maybeAbort(collection)) { + aborted = true; + return; + } + + Map propMap = new HashMap<>(); + propMap.put(Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.CREATE.toLower()); + propMap.put(CommonParams.NAME, targetCollection); + propMap.put(ZkStateReader.NUM_SHARDS_PROP, numShards); + propMap.put(CollectionAdminParams.COLL_CONF, configName); + // init first from the same router + propMap.put("router.name", router.getName()); + for (String key : coll.keySet()) { + if (key.startsWith("router.")) { + propMap.put(key, coll.get(key)); + } + } + // then apply overrides if present + for (String key : message.keySet()) { + if (key.startsWith("router.")) { + propMap.put(key, message.getStr(key)); + } else if (COLLECTION_PARAMS.contains(key)) { + propMap.put(key, message.get(key)); + } + } + + propMap.put(ZkStateReader.MAX_SHARDS_PER_NODE, maxShardsPerNode); + propMap.put(CommonAdminParams.WAIT_FOR_FINAL_STATE, true); + propMap.put(DocCollection.STATE_FORMAT, message.getInt(DocCollection.STATE_FORMAT, coll.getStateFormat())); + if (rf != null) { + propMap.put(ZkStateReader.REPLICATION_FACTOR, rf); + } + if (numNrt != null) { + propMap.put(ZkStateReader.NRT_REPLICAS, numNrt); + } + if (numTlog != null) { + propMap.put(ZkStateReader.TLOG_REPLICAS, numTlog); + } + if (numPull != null) { + propMap.put(ZkStateReader.PULL_REPLICAS, numPull); + } + // create the target collection + cmd = new ZkNodeProps(propMap); + cmdResults = new NamedList<>(); + ocmh.commandMap.get(CollectionParams.CollectionAction.CREATE).call(clusterState, cmd, cmdResults); + createdTarget = true; + checkResults("creating target collection " + targetCollection, cmdResults, true); + + // create the checkpoint collection - use RF=1 and 1 shard + cmd = new ZkNodeProps( + Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.CREATE.toLower(), + CommonParams.NAME, chkCollection, + ZkStateReader.NUM_SHARDS_PROP, "1", + ZkStateReader.REPLICATION_FACTOR, "1", + DocCollection.STATE_FORMAT, "2", + CollectionAdminParams.COLL_CONF, "_default", + CommonAdminParams.WAIT_FOR_FINAL_STATE, "true" + ); + cmdResults = new NamedList<>(); + ocmh.commandMap.get(CollectionParams.CollectionAction.CREATE).call(clusterState, cmd, cmdResults); + checkResults("creating checkpoint collection " + chkCollection, cmdResults, true); + // wait for a while until we see both collections + TimeOut waitUntil = new TimeOut(30, TimeUnit.SECONDS, ocmh.timeSource); + boolean created = false; + while (!waitUntil.hasTimedOut()) { + waitUntil.sleep(100); + // this also refreshes our local var clusterState + clusterState = ocmh.cloudManager.getClusterStateProvider().getClusterState(); + created = 
clusterState.hasCollection(targetCollection) && clusterState.hasCollection(chkCollection); + if (created) break; + } + if (!created) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Could not fully create temporary collection(s)"); + } + if (maybeAbort(collection)) { + aborted = true; + return; + } + + // 1. put the source collection in read-only mode + cmd = new ZkNodeProps( + Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.MODIFYCOLLECTION.toLower(), + ZkStateReader.COLLECTION_PROP, collection, + ZkStateReader.READ_ONLY, "true"); + ocmh.overseer.offerStateUpdate(Utils.toJSON(cmd)); + + TestInjection.injectReindexLatch(); + + if (maybeAbort(collection)) { + aborted = true; + return; + } + + // 2. copy the documents to target + // Recipe taken from: http://joelsolr.blogspot.com/2016/10/solr-63-batch-jobs-parallel-etl-and.html + ModifiableSolrParams q = new ModifiableSolrParams(); + q.set(CommonParams.QT, "/stream"); + q.set("collection", collection); + q.set("expr", + "daemon(id=\"" + targetCollection + "\"," + + "terminate=\"true\"," + + "commit(" + targetCollection + "," + + "update(" + targetCollection + "," + + "batchSize=" + batchSize + "," + + "topic(" + chkCollection + "," + + collection + "," + + "q=\"" + query + "\"," + + "fl=\"" + fl + "\"," + + "id=\"topic_" + targetCollection + "\"," + + // some of the documents eg. in .system contain large blobs + "rows=\"" + batchSize + "\"," + + "initialCheckpoint=\"0\"))))"); + log.debug("- starting copying documents from " + collection + " to " + targetCollection); + SolrResponse rsp = null; + try { + rsp = ocmh.cloudManager.request(new QueryRequest(q)); + } catch (Exception e) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to copy documents from " + + collection + " to " + targetCollection, e); + } + daemonUrl = getDaemonUrl(rsp, coll); + if (daemonUrl == null) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to copy documents from " + + collection + " to " + targetCollection + ": " + Utils.toJSONString(rsp)); + } + reindexingState.put("daemonUrl", daemonUrl); + reindexingState.put("daemonName", targetCollection); + reindexingState.put(PHASE, "copying documents"); + setReindexingState(collection, State.RUNNING, reindexingState); + + // wait for the daemon to finish + waitForDaemon(targetCollection, daemonUrl, collection, targetCollection, reindexingState); + if (maybeAbort(collection)) { + aborted = true; + return; + } + log.debug("- finished copying from " + collection + " to " + targetCollection); + // fail here or earlier during daemon run + TestInjection.injectReindexFailure(); + + // 5. 
if (sameTarget) set up an alias to use targetCollection as the source name + if (sameTarget) { + log.debug("- setting up alias from " + originalCollection + " to " + targetCollection); + cmd = new ZkNodeProps( + CommonParams.NAME, originalCollection, + "collections", targetCollection); + cmdResults = new NamedList<>(); + ocmh.commandMap.get(CollectionParams.CollectionAction.CREATEALIAS).call(clusterState, cmd, results); + checkResults("setting up alias " + originalCollection + " -> " + targetCollection, cmdResults, true); + reindexingState.put("alias", originalCollection + " -> " + targetCollection); + } + + reindexingState.remove("daemonUrl"); + reindexingState.remove("daemonName"); + reindexingState.put("processedDocs", getNumberOfDocs(targetCollection)); + reindexingState.put(PHASE, "copying done, finalizing"); + setReindexingState(collection, State.RUNNING, reindexingState); + + if (maybeAbort(collection)) { + aborted = true; + return; + } + // 6. delete the checkpoint collection + log.debug("- deleting " + chkCollection); + cmd = new ZkNodeProps( + Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.DELETE.toLower(), + CommonParams.NAME, chkCollection, + CoreAdminParams.DELETE_METRICS_HISTORY, "true" + ); + cmdResults = new NamedList<>(); + ocmh.commandMap.get(CollectionParams.CollectionAction.DELETE).call(clusterState, cmd, cmdResults); + checkResults("deleting checkpoint collection " + chkCollection, cmdResults, true); + + // 7. optionally delete the source collection + if (removeSource) { + log.debug("- deleting source collection"); + cmd = new ZkNodeProps( + Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.DELETE.toLower(), + CommonParams.NAME, collection, + CoreAdminParams.DELETE_METRICS_HISTORY, "true" + ); + cmdResults = new NamedList<>(); + ocmh.commandMap.get(CollectionParams.CollectionAction.DELETE).call(clusterState, cmd, cmdResults); + checkResults("deleting source collection " + collection, cmdResults, true); + } else { + // 8. clear readOnly on source + ZkNodeProps props = new ZkNodeProps( + Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.MODIFYCOLLECTION.toLower(), + ZkStateReader.COLLECTION_PROP, collection, + ZkStateReader.READ_ONLY, null); + ocmh.overseer.offerStateUpdate(Utils.toJSON(props)); + } + // 9. 
set FINISHED state on the target and clear the state on the source + ZkNodeProps props = new ZkNodeProps( + Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.MODIFYCOLLECTION.toLower(), + ZkStateReader.COLLECTION_PROP, targetCollection, + REINDEXING_STATE, State.FINISHED.toLower()); + ocmh.overseer.offerStateUpdate(Utils.toJSON(props)); + + reindexingState.put(STATE, State.FINISHED.toLower()); + reindexingState.put(PHASE, "done"); + removeReindexingState(collection); + } catch (Exception e) { + log.warn("Error during reindexing of " + originalCollection, e); + exc = e; + aborted = true; + } finally { + solrClientCache.close(); + if (aborted) { + cleanup(collection, targetCollection, chkCollection, daemonUrl, targetCollection, createdTarget); + if (exc != null) { + results.add("error", exc.toString()); + } + reindexingState.put(STATE, State.ABORTED.toLower()); + } + results.add(REINDEX_STATUS, reindexingState); + } + } + + private static final String REINDEXING_STATE_PATH = "/.reindexing"; + + private Map setReindexingState(String collection, State state, Map props) throws Exception { + String path = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + REINDEXING_STATE_PATH; + DistribStateManager stateManager = ocmh.cloudManager.getDistribStateManager(); + Map copyProps = new HashMap<>(); + if (props == null) { // retrieve existing props, if any + props = Utils.getJson(stateManager, path); + } + copyProps.putAll(props); + copyProps.put("state", state.toLower()); + if (stateManager.hasData(path)) { + stateManager.setData(path, Utils.toJSON(copyProps), -1); + } else { + stateManager.makePath(path, Utils.toJSON(copyProps), CreateMode.PERSISTENT, false); + } + return copyProps; + } + + private void removeReindexingState(String collection) throws Exception { + String path = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + REINDEXING_STATE_PATH; + DistribStateManager stateManager = ocmh.cloudManager.getDistribStateManager(); + if (stateManager.hasData(path)) { + stateManager.removeData(path, -1); + } + } + + @VisibleForTesting + public static Map getReindexingState(DistribStateManager stateManager, String collection) throws Exception { + String path = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + REINDEXING_STATE_PATH; + // make it modifiable + return new TreeMap<>(Utils.getJson(stateManager, path)); + } + + private long getNumberOfDocs(String collection) { + CloudSolrClient solrClient = solrClientCache.getCloudSolrClient(zkHost); + try { + ModifiableSolrParams params = new ModifiableSolrParams(); + params.add(CommonParams.Q, "*:*"); + params.add(CommonParams.ROWS, "0"); + QueryResponse rsp = solrClient.query(collection, params); + return rsp.getResults().getNumFound(); + } catch (Exception e) { + return 0L; + } + } + + private void checkResults(String label, NamedList results, boolean failureIsFatal) throws Exception { + Object failure = results.get("failure"); + if (failure == null) { + failure = results.get("error"); + } + if (failure != null) { + String msg = "Error: " + label + ": " + Utils.toJSONString(results); + if (failureIsFatal) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, msg); + } else { + log.error(msg); + } + } + } + + private boolean maybeAbort(String collection) throws Exception { + DocCollection coll = ocmh.cloudManager.getClusterStateProvider().getClusterState().getCollectionOrNull(collection); + if (coll == null) { + // collection no longer present - abort + log.info("## Aborting - collection {} no longer present.", collection); + 
return true; + } + Map reindexingState = getReindexingState(ocmh.cloudManager.getDistribStateManager(), collection); + State state = State.get(reindexingState.getOrDefault(STATE, State.RUNNING.toLower())); + if (state != State.ABORTED) { + return false; + } + log.info("## Aborting - collection {} state is {}", collection, state); + return true; + } + + // XXX see #waitForDaemon() for why we need this + private String getDaemonUrl(SolrResponse rsp, DocCollection coll) { + Map rs = (Map)rsp.getResponse().get("result-set"); + if (rs == null || rs.isEmpty()) { + log.debug(" -- Missing daemon information in response: " + Utils.toJSONString(rsp)); + } + List list = (List)rs.get("docs"); + if (list == null) { + log.debug(" -- Missing daemon information in response: " + Utils.toJSONString(rsp)); + return null; + } + String replicaName = null; + for (Object o : list) { + Map map = (Map)o; + String op = (String)map.get("DaemonOp"); + if (op == null) { + continue; + } + String[] parts = op.split("\\s+"); + if (parts.length != 4) { + log.debug(" -- Invalid daemon location info, expected 4 tokens: " + op); + return null; + } + // check if it's plausible + if (parts[3].contains("shard") && parts[3].contains("replica")) { + replicaName = parts[3]; + break; + } else { + log.debug(" -- daemon location info likely invalid: " + op); + return null; + } + } + if (replicaName == null) { + return null; + } + // build a baseUrl of the replica + for (Replica r : coll.getReplicas()) { + if (replicaName.equals(r.getCoreName())) { + return r.getBaseUrl() + "/" + r.getCoreName(); + } + } + return null; + } + + // XXX currently this is complicated to due a bug in the way the daemon 'list' + // XXX operation is implemented - see SOLR-13245. We need to query the actual + // XXX SolrCore where the daemon is running + private void waitForDaemon(String daemonName, String daemonUrl, String sourceCollection, String targetCollection, Map reindexingState) throws Exception { + HttpClient client = ocmh.overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient(); + try (HttpSolrClient solrClient = new HttpSolrClient.Builder() + .withHttpClient(client) + .withBaseSolrUrl(daemonUrl).build()) { + ModifiableSolrParams q = new ModifiableSolrParams(); + q.set(CommonParams.QT, "/stream"); + q.set("action", "list"); + q.set(CommonParams.DISTRIB, false); + QueryRequest req = new QueryRequest(q); + boolean isRunning; + int statusCheck = 0; + do { + isRunning = false; + statusCheck++; + try { + NamedList rsp = solrClient.request(req); + Map rs = (Map)rsp.get("result-set"); + if (rs == null || rs.isEmpty()) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Can't find daemon list: missing result-set: " + Utils.toJSONString(rsp)); + } + List list = (List)rs.get("docs"); + if (list == null) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Can't find daemon list: missing result-set: " + Utils.toJSONString(rsp)); + } + if (list.isEmpty()) { // finished? 
+ break; + } + for (Object o : list) { + Map map = (Map)o; + String id = (String)map.get("id"); + if (daemonName.equals(id)) { + isRunning = true; + // fail here + TestInjection.injectReindexFailure(); + break; + } + } + } catch (Exception e) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Exception waiting for daemon " + + daemonName + " at " + daemonUrl, e); + } + if (statusCheck % 5 == 0) { + reindexingState.put("processedDocs", getNumberOfDocs(targetCollection)); + setReindexingState(sourceCollection, State.RUNNING, reindexingState); + } + ocmh.cloudManager.getTimeSource().sleep(2000); + } while (isRunning && !maybeAbort(sourceCollection)); + } + } + + private void killDaemon(String daemonName, String daemonUrl) throws Exception { + log.debug("-- killing daemon " + daemonName + " at " + daemonUrl); + HttpClient client = ocmh.overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient(); + try (HttpSolrClient solrClient = new HttpSolrClient.Builder() + .withHttpClient(client) + .withBaseSolrUrl(daemonUrl).build()) { + ModifiableSolrParams q = new ModifiableSolrParams(); + q.set(CommonParams.QT, "/stream"); + // we should really use 'kill' here, but then we will never + // know when the daemon actually finishes running - 'kill' only + // sets a flag that may be noticed much later + q.set("action", "stop"); + q.set(CommonParams.ID, daemonName); + q.set(CommonParams.DISTRIB, false); + QueryRequest req = new QueryRequest(q); + NamedList rsp = solrClient.request(req); + // /result-set/docs/[0]/DaemonOp : Deamon:id killed on coreName + log.debug(" -- stop daemon response: " + Utils.toJSONString(rsp)); + Map rs = (Map) rsp.get("result-set"); + if (rs == null || rs.isEmpty()) { + log.warn("Problem killing daemon " + daemonName + ": missing result-set: " + Utils.toJSONString(rsp)); + return; + } + List list = (List) rs.get("docs"); + if (list == null) { + log.warn("Problem killing daemon " + daemonName + ": missing result-set: " + Utils.toJSONString(rsp)); + return; + } + if (list.isEmpty()) { // already finished? + return; + } + for (Object o : list) { + Map map = (Map) o; + String op = (String) map.get("DaemonOp"); + if (op == null) { + continue; + } + if (op.contains(daemonName) && op.contains("stopped")) { + // now wait for the daemon to really stop + q.set("action", "list"); + req = new QueryRequest(q); + TimeOut timeOut = new TimeOut(60, TimeUnit.SECONDS, ocmh.timeSource); + while (!timeOut.hasTimedOut()) { + rsp = solrClient.request(req); + rs = (Map) rsp.get("result-set"); + if (rs == null || rs.isEmpty()) { + log.warn("Problem killing daemon " + daemonName + ": missing result-set: " + Utils.toJSONString(rsp)); + break; + } + List list2 = (List) rs.get("docs"); + if (list2 == null) { + log.warn("Problem killing daemon " + daemonName + ": missing result-set: " + Utils.toJSONString(rsp)); + break; + } + if (list2.isEmpty()) { // already finished? + break; + } + Map status2 = null; + for (Object o2 : list2) { + Map map2 = (Map)o2; + if (daemonName.equals(map2.get("id"))) { + status2 = map2; + break; + } + } + if (status2 == null) { // finished? 
+ break; + } + Number stopTime = (Number)status2.get("stopTime"); + if (stopTime.longValue() > 0) { + break; + } + } + if (timeOut.hasTimedOut()) { + log.warn("Problem killing daemon " + daemonName + ": timed out waiting for daemon to stop."); + // proceed anyway + } + } + } + // now kill it - it's already stopped, this simply removes its status + q.set("action", "kill"); + req = new QueryRequest(q); + solrClient.request(req); + } + } + + private void cleanup(String collection, String targetCollection, String chkCollection, + String daemonUrl, String daemonName, boolean createdTarget) throws Exception { + log.info("## Cleaning up after abort or error"); + // 1. kill the daemon + // 2. cleanup target / chk collections IFF the source collection still exists and is not empty + // 3. cleanup collection state + + if (daemonUrl != null) { + killDaemon(daemonName, daemonUrl); + } + ClusterState clusterState = ocmh.cloudManager.getClusterStateProvider().getClusterState(); + NamedList cmdResults = new NamedList<>(); + if (createdTarget && !collection.equals(targetCollection) && clusterState.hasCollection(targetCollection)) { + log.debug(" -- removing " + targetCollection); + ZkNodeProps cmd = new ZkNodeProps( + Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.DELETE.toLower(), + CommonParams.NAME, targetCollection, + CoreAdminParams.DELETE_METRICS_HISTORY, "true" + ); + ocmh.commandMap.get(CollectionParams.CollectionAction.DELETE).call(clusterState, cmd, cmdResults); + checkResults("CLEANUP: deleting target collection " + targetCollection, cmdResults, false); + + } + // remove chk collection + if (clusterState.hasCollection(chkCollection)) { + log.debug(" -- removing " + chkCollection); + ZkNodeProps cmd = new ZkNodeProps( + Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.DELETE.toLower(), + CommonParams.NAME, chkCollection, + CoreAdminParams.DELETE_METRICS_HISTORY, "true" + ); + cmdResults = new NamedList<>(); + ocmh.commandMap.get(CollectionParams.CollectionAction.DELETE).call(clusterState, cmd, cmdResults); + checkResults("CLEANUP: deleting checkpoint collection " + chkCollection, cmdResults, false); + } + log.debug(" -- turning readOnly mode off for " + collection); + ZkNodeProps props = new ZkNodeProps( + Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.MODIFYCOLLECTION.toLower(), + ZkStateReader.COLLECTION_PROP, collection, + ZkStateReader.READ_ONLY, null); + ocmh.overseer.offerStateUpdate(Utils.toJSON(props)); + removeReindexingState(collection); + } +} diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java index 410e26efc4..c7466282e8 100644 --- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java +++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java @@ -748,10 +748,16 @@ public class CoreContainer { containerHandlers.put(AutoScalingHandler.HANDLER_PATH, autoScalingHandler); autoScalingHandler.initializeMetrics(metricManager, SolrInfoBean.Group.node.toString(), metricTag, AutoScalingHandler.HANDLER_PATH); } + // verify .system compatibility + systemCollCompatCheck(); // This is a bit redundant but these are two distinct concepts for all they're accomplished at the same time. 
status |= LOAD_COMPLETE | INITIAL_CORE_LOAD_COMPLETE; } + private void systemCollCompatCheck() { + + } + // MetricsHistoryHandler supports both cloud and standalone configs private void createMetricsHistoryHandler() { PluginInfo plugin = cfg.getMetricsConfig().getHistoryHandler(); diff --git a/solr/core/src/java/org/apache/solr/handler/StreamHandler.java b/solr/core/src/java/org/apache/solr/handler/StreamHandler.java index a447093f73..545e2b305c 100644 --- a/solr/core/src/java/org/apache/solr/handler/StreamHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/StreamHandler.java @@ -216,8 +216,10 @@ public class StreamHandler extends RequestHandlerBase implements SolrCoreAware, DaemonStream d = daemons.remove(id); if (d != null) { d.close(); + rsp.add("result-set", new DaemonResponseStream("Deamon:" + id + " killed on " + coreName)); + } else { + rsp.add("result-set", new DaemonResponseStream("Deamon:" + id + " not found on " + coreName)); } - rsp.add("result-set", new DaemonResponseStream("Deamon:" + id + " killed on " + coreName)); } } } diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java index 70bcd1ac13..e933ac35d7 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java @@ -53,6 +53,7 @@ import org.apache.solr.cloud.OverseerTaskQueue.QueueEvent; import org.apache.solr.cloud.ZkController.NotInClusterStateException; import org.apache.solr.cloud.ZkController; import org.apache.solr.cloud.ZkShardTerms; +import org.apache.solr.cloud.api.collections.ReindexCollectionCmd; import org.apache.solr.cloud.api.collections.RoutedAlias; import org.apache.solr.cloud.overseer.SliceMutator; import org.apache.solr.cloud.rule.ReplicaAssigner; @@ -77,6 +78,7 @@ import org.apache.solr.common.params.AutoScalingParams; import org.apache.solr.common.params.CollectionAdminParams; import org.apache.solr.common.params.CollectionParams; import org.apache.solr.common.params.CollectionParams.CollectionAction; +import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.CoreAdminParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.SolrParams; @@ -540,6 +542,35 @@ public class CollectionsHandler extends RequestHandlerBase implements Permission RELOAD_OP(RELOAD, (req, rsp, h) -> copy(req.getParams().required(), null, NAME)), + REINDEXCOLLECTION_OP(REINDEXCOLLECTION, (req, rsp, h) -> { + Map m = copy(req.getParams().required(), null, NAME); + copy(req.getParams(), m, + ReindexCollectionCmd.COMMAND, + ReindexCollectionCmd.REMOVE_SOURCE, + ReindexCollectionCmd.TARGET, + ZkStateReader.CONFIGNAME_PROP, + NUM_SLICES, + NRT_REPLICAS, + PULL_REPLICAS, + TLOG_REPLICAS, + REPLICATION_FACTOR, + MAX_SHARDS_PER_NODE, + POLICY, + CREATE_NODE_SET, + CREATE_NODE_SET_SHUFFLE, + AUTO_ADD_REPLICAS, + "shards", + STATE_FORMAT, + CommonParams.ROWS, + CommonParams.Q, + CommonParams.FL); + if (req.getParams().get("collection." + ZkStateReader.CONFIGNAME_PROP) != null) { + m.put(ZkStateReader.CONFIGNAME_PROP, req.getParams().get("collection." 
+ ZkStateReader.CONFIGNAME_PROP)); + } + copyPropertiesWithPrefix(req.getParams(), m, "router."); + return m; + }), + SYNCSHARD_OP(SYNCSHARD, (req, rsp, h) -> { String collection = req.getParams().required().get("collection"); String shard = req.getParams().required().get("shard"); diff --git a/solr/core/src/java/org/apache/solr/util/TestInjection.java b/solr/core/src/java/org/apache/solr/util/TestInjection.java index cf7681e63a..7a49ba496e 100644 --- a/solr/core/src/java/org/apache/solr/util/TestInjection.java +++ b/solr/core/src/java/org/apache/solr/util/TestInjection.java @@ -126,6 +126,10 @@ public class TestInjection { public volatile static CountDownLatch splitLatch = null; + public volatile static CountDownLatch reindexLatch = null; + + public volatile static String reindexFailure = null; + public volatile static String failIndexFingerprintRequests = null; public volatile static String wrongIndexFingerprint = null; @@ -156,6 +160,8 @@ public class TestInjection { splitFailureBeforeReplicaCreation = null; splitFailureAfterReplicaCreation = null; splitLatch = null; + reindexLatch = null; + reindexFailure = null; prepRecoveryOpPauseForever = null; countPrepRecoveryOpPauseForever = new AtomicInteger(0); failIndexFingerprintRequests = null; @@ -423,6 +429,35 @@ public class TestInjection { return true; } + public static boolean injectReindexFailure() { + if (reindexFailure != null) { + Random rand = random(); + if (null == rand) return true; + + Pair pair = parseValue(reindexFailure); + boolean enabled = pair.first(); + int chanceIn100 = pair.second(); + if (enabled && rand.nextInt(100) >= (100 - chanceIn100)) { + log.info("Test injection failure"); + throw new SolrException(ErrorCode.SERVER_ERROR, "Test injection failure"); + } + } + return true; + } + + + public static boolean injectReindexLatch() { + if (reindexLatch != null) { + try { + log.info("Waiting in ReindexCollectionCmd for up to 60s"); + return reindexLatch.await(60, TimeUnit.SECONDS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + return true; + } + private static Pair parseValue(final String raw) { if (raw == null) return new Pair<>(false, 0); Matcher m = ENABLED_PERCENT.matcher(raw); diff --git a/solr/core/src/test/org/apache/solr/cloud/ReindexCollectionTest.java b/solr/core/src/test/org/apache/solr/cloud/ReindexCollectionTest.java new file mode 100644 index 0000000000..a9f9a22d5c --- /dev/null +++ b/solr/core/src/test/org/apache/solr/cloud/ReindexCollectionTest.java @@ -0,0 +1,379 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.cloud; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.function.Function; + +import org.apache.solr.client.solrj.cloud.DistribStateManager; +import org.apache.solr.client.solrj.cloud.SolrCloudManager; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.response.CollectionAdminResponse; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.cloud.api.collections.ReindexCollectionCmd; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.cloud.ClusterState; +import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.ImplicitDocRouter; +import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.util.LogLevel; +import org.apache.solr.util.TestInjection; +import org.apache.solr.util.TimeOut; +import org.junit.After; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * + */ +@LogLevel("org.apache.solr.cloud.api.collections.ReindexCollectionCmd=DEBUG") +public class ReindexCollectionTest extends SolrCloudTestCase { + + @BeforeClass + public static void setupCluster() throws Exception { + configureCluster(2) + // only *_s + .addConfig("conf1", configset("cloud-minimal")) + // every combination of field flags + .addConfig("conf2", configset("cloud-dynamic")) + // catch-all * field, indexed+stored + .addConfig("conf3", configset("cloud-minimal-inplace-updates")) + .configure(); + } + + private CloudSolrClient solrClient; + private SolrCloudManager cloudManager; + private DistribStateManager stateManager; + + @Before + public void doBefore() throws Exception { + ZkController zkController = cluster.getJettySolrRunner(0).getCoreContainer().getZkController(); + cloudManager = zkController.getSolrCloudManager(); + stateManager = cloudManager.getDistribStateManager(); + solrClient = new CloudSolrClientBuilder(Collections.singletonList(zkController.getZkServerAddress()), + Optional.empty()).build(); + } + + private ReindexCollectionCmd.State getState(String collection) { + try { + return ReindexCollectionCmd.State.get(ReindexCollectionCmd + .getReindexingState(stateManager, collection) + .get(ReindexCollectionCmd.STATE)); + } catch (Exception e) { + fail("Unexpected exception checking state of " + collection + ": " + e); + return null; + } + } + + private void waitForState(String collection, ReindexCollectionCmd.State expected) throws Exception { + TimeOut timeOut = new TimeOut(30, TimeUnit.SECONDS, cloudManager.getTimeSource()); + ReindexCollectionCmd.State current = null; + while (!timeOut.hasTimedOut()) { + current = getState(collection); + if (expected == current) { + return; + } + timeOut.sleep(500); + } + throw new Exception("timeout waiting for state, current=" + current + ", expected=" + expected); + } + + @After + public void doAfter() throws Exception { + cluster.deleteAllCollections(); // deletes aliases too + + solrClient.close(); + + TestInjection.reset(); + } + + private static final int NUM_DOCS = 200; // at least two batches, default batchSize=100 + + @Test + public void testBasicReindexing() throws Exception { + final String 
sourceCollection = "basicReindexing"; + + createCollection(sourceCollection, "conf1", 2, 2); + + indexDocs(sourceCollection, NUM_DOCS, + i -> new SolrInputDocument("id", String.valueOf(i), "string_s", String.valueOf(i))); + + final String targetCollection = "basicReindexingTarget"; + + CollectionAdminRequest.ReindexCollection req = CollectionAdminRequest.reindexCollection(sourceCollection) + .setTarget(targetCollection); + CollectionAdminResponse rsp = req.process(solrClient); + assertNotNull(rsp.toString(), rsp.getResponse().get(ReindexCollectionCmd.REINDEX_STATUS)); + Map status = (Map)rsp.getResponse().get(ReindexCollectionCmd.REINDEX_STATUS); + assertEquals(status.toString(), (long)NUM_DOCS, ((Number)status.get("inputDocs")).longValue()); + assertEquals(status.toString(), (long)NUM_DOCS, ((Number)status.get("processedDocs")).longValue()); + + CloudTestUtils.waitForState(cloudManager, "did not finish copying in time", targetCollection, (liveNodes, coll) -> { + ReindexCollectionCmd.State state = ReindexCollectionCmd.State.get(coll.getStr(ReindexCollectionCmd.REINDEXING_STATE)); + return ReindexCollectionCmd.State.FINISHED == state; + }); + // verify the target docs exist + QueryResponse queryResponse = solrClient.query(targetCollection, params(CommonParams.Q, "*:*")); + assertEquals("copied num docs", NUM_DOCS, queryResponse.getResults().getNumFound()); + } + + public void testSameTargetReindexing() throws Exception { + final String sourceCollection = "sameTargetReindexing"; + final String targetCollection = sourceCollection; + + createCollection(sourceCollection, "conf1", 2, 2); + indexDocs(sourceCollection, NUM_DOCS, + i -> new SolrInputDocument("id", String.valueOf(i), "string_s", String.valueOf(i))); + + CollectionAdminRequest.ReindexCollection req = CollectionAdminRequest.reindexCollection(sourceCollection) + .setTarget(targetCollection); + req.process(solrClient); + + String realTargetCollection = null; + TimeOut timeOut = new TimeOut(30, TimeUnit.SECONDS, cloudManager.getTimeSource()); + String prefix = ReindexCollectionCmd.TARGET_COL_PREFIX + targetCollection; + while (!timeOut.hasTimedOut()) { + timeOut.sleep(500); + for (String name : cloudManager.getClusterStateProvider().getClusterState().getCollectionsMap().keySet()) { + if (name.startsWith(prefix)) { + realTargetCollection = name; + break; + } + } + if (realTargetCollection != null) { + break; + } + } + assertNotNull("target collection not present after 30s", realTargetCollection); + + CloudTestUtils.waitForState(cloudManager, "did not finish copying in time", realTargetCollection, (liveNodes, coll) -> { + ReindexCollectionCmd.State state = ReindexCollectionCmd.State.get(coll.getStr(ReindexCollectionCmd.REINDEXING_STATE)); + return ReindexCollectionCmd.State.FINISHED == state; + }); + // verify the target docs exist + QueryResponse rsp = solrClient.query(targetCollection, params(CommonParams.Q, "*:*")); + assertEquals("copied num docs", NUM_DOCS, rsp.getResults().getNumFound()); + } + + @Test + public void testLossySchema() throws Exception { + final String sourceCollection = "sourceLossyReindexing"; + final String targetCollection = "targetLossyReindexing"; + + + createCollection(sourceCollection, "conf2", 2, 2); + + indexDocs(sourceCollection, NUM_DOCS, i -> + new SolrInputDocument( + "id", String.valueOf(i), + "string_s", String.valueOf(i), + "sind", "this is a test " + i)); // "sind": indexed=true, stored=false, will be lost... 
+ + CollectionAdminRequest.ReindexCollection req = CollectionAdminRequest.reindexCollection(sourceCollection) + .setTarget(targetCollection) + .setConfigName("conf3"); + req.process(solrClient); + + CloudTestUtils.waitForState(cloudManager, "did not finish copying in time", targetCollection, (liveNodes, coll) -> { + ReindexCollectionCmd.State state = ReindexCollectionCmd.State.get(coll.getStr(ReindexCollectionCmd.REINDEXING_STATE)); + return ReindexCollectionCmd.State.FINISHED == state; + }); + // verify the target docs exist + QueryResponse rsp = solrClient.query(targetCollection, params(CommonParams.Q, "*:*")); + assertEquals("copied num docs", NUM_DOCS, rsp.getResults().getNumFound()); + for (SolrDocument doc : rsp.getResults()) { + String id = (String)doc.getFieldValue("id"); + assertEquals(id, doc.getFieldValue("string_s")); + assertFalse(doc.containsKey("sind")); // lost in translation ... + } + } + + @Test + public void testReshapeReindexing() throws Exception { + final String sourceCollection = "reshapeReindexing"; + final String targetCollection = "reshapeReindexingTarget"; + createCollection(sourceCollection, "conf1", 2, 2); + indexDocs(sourceCollection, NUM_DOCS, + i -> new SolrInputDocument( + "id", String.valueOf(i), + "string_s", String.valueOf(i), + "remove_s", String.valueOf(i))); + + CollectionAdminRequest.ReindexCollection req = CollectionAdminRequest.reindexCollection(sourceCollection) + .setTarget(targetCollection) + .setCollectionParam(ZkStateReader.NUM_SHARDS_PROP, 3) + .setCollectionParam(ZkStateReader.REPLICATION_FACTOR, 1) + .setCollectionParam("router.name", ImplicitDocRouter.NAME) + .setCollectionParam("shards", "foo,bar,baz") + .setCollectionParam("fl", "id,string_s") + .setCollectionParam("q", "id:10*"); + req.process(solrClient); + + CloudTestUtils.waitForState(cloudManager, "did not finish copying in time", targetCollection, (liveNodes, coll) -> { + ReindexCollectionCmd.State state = ReindexCollectionCmd.State.get(coll.getStr(ReindexCollectionCmd.REINDEXING_STATE)); + return ReindexCollectionCmd.State.FINISHED == state; + }); + // verify the target docs exist + QueryResponse rsp = solrClient.query(targetCollection, params(CommonParams.Q, "*:*")); + // 10 and 100-109 + assertEquals("copied num docs", 11, rsp.getResults().getNumFound()); + // verify the correct fields exist + for (SolrDocument doc : rsp.getResults()) { + assertNotNull(doc.getFieldValue("id")); + assertNotNull(doc.getFieldValue("string_s")); + assertNull(doc.getFieldValue("remove_s")); + } + + // check the shape of the new collection + ClusterState clusterState = solrClient.getClusterStateProvider().getClusterState(); + List aliases = solrClient.getZkStateReader().getAliases().resolveAliases(targetCollection); + assertFalse(aliases.isEmpty()); + String realTargetCollection = aliases.get(0); + DocCollection coll = clusterState.getCollection(realTargetCollection); + assertNotNull(coll); + assertEquals(3, coll.getSlices().size()); + assertNotNull("foo", coll.getSlice("foo")); + assertNotNull("bar", coll.getSlice("bar")); + assertNotNull("baz", coll.getSlice("baz")); + assertEquals(new Integer(1), coll.getReplicationFactor()); + assertEquals(ImplicitDocRouter.NAME, coll.getRouter().getName()); + } + + @Test + public void testFailure() throws Exception { + final String sourceCollection = "failReindexing"; + final String targetCollection = "failReindexingTarget"; + final String aliasTarget = "failAlias"; + createCollection(sourceCollection, "conf1", 2, 2); + createCollection(targetCollection, "conf1", 
1, 1); + CollectionAdminRequest.createAlias(aliasTarget, targetCollection).process(solrClient); + indexDocs(sourceCollection, NUM_DOCS, + i -> new SolrInputDocument( + "id", String.valueOf(i), + "string_s", String.valueOf(i))); + + CollectionAdminRequest.ReindexCollection req = CollectionAdminRequest.reindexCollection(sourceCollection) + .setTarget(targetCollection); + CollectionAdminResponse rsp = req.process(solrClient); + assertNotNull(rsp.getResponse().get("error")); + assertTrue(rsp.toString(), rsp.getResponse().get("error").toString().contains("already exists")); + + req = CollectionAdminRequest.reindexCollection(sourceCollection) + .setTarget(aliasTarget); + rsp = req.process(solrClient); + assertNotNull(rsp.getResponse().get("error")); + assertTrue(rsp.toString(), rsp.getResponse().get("error").toString().contains("already exists")); + + CollectionAdminRequest.deleteAlias(aliasTarget).process(solrClient); + CollectionAdminRequest.deleteCollection(targetCollection).process(solrClient); + + req = CollectionAdminRequest.reindexCollection(sourceCollection) + .setTarget(targetCollection); + + TestInjection.reindexFailure = "true:100"; + rsp = req.process(solrClient); + assertNotNull(rsp.getResponse().get("error")); + assertTrue(rsp.toString(), rsp.getResponse().get("error").toString().contains("waiting for daemon")); + + // verify that the target and checkpoint collections don't exist + cloudManager.getClusterStateProvider().getClusterState().forEachCollection(coll -> { + assertFalse(coll.getName() + " still exists", coll.getName().startsWith(ReindexCollectionCmd.TARGET_COL_PREFIX)); + assertFalse(coll.getName() + " still exists", coll.getName().startsWith(ReindexCollectionCmd.CHK_COL_PREFIX)); + }); + // verify that the source collection is read-write and has no reindexing flags + CloudTestUtils.waitForState(cloudManager, "collection state is incorrect", sourceCollection, + ((liveNodes, collectionState) -> + !collectionState.isReadOnly() && + collectionState.getStr(ReindexCollectionCmd.REINDEXING_STATE) == null && + getState(sourceCollection) == null)); + } + + @Test + public void testAbort() throws Exception { + final String sourceCollection = "abortReindexing"; + final String targetCollection = "abortReindexingTarget"; + createCollection(sourceCollection, "conf1", 2, 1); + + TestInjection.reindexLatch = new CountDownLatch(1); + CollectionAdminRequest.ReindexCollection req = CollectionAdminRequest.reindexCollection(sourceCollection) + .setTarget(targetCollection); + String asyncId = req.processAsync(solrClient); + // wait for the source collection to be put in readOnly mode + CloudTestUtils.waitForState(cloudManager, "source collection didn't become readOnly", + sourceCollection, (liveNodes, coll) -> coll.isReadOnly()); + + req = CollectionAdminRequest.reindexCollection(sourceCollection); + req.setCommand("abort"); + CollectionAdminResponse rsp = req.process(solrClient); + Map status = (Map)rsp.getResponse().get(ReindexCollectionCmd.REINDEX_STATUS); + assertNotNull(rsp.toString(), status); + assertEquals(status.toString(), "aborting", status.get("state")); + + CloudTestUtils.waitForState(cloudManager, "incorrect collection state", sourceCollection, + ((liveNodes, collectionState) -> + collectionState.isReadOnly() && + getState(sourceCollection) == ReindexCollectionCmd.State.ABORTED)); + + // verify status + req.setCommand("status"); + rsp = req.process(solrClient); + status = (Map)rsp.getResponse().get(ReindexCollectionCmd.REINDEX_STATUS); + assertNotNull(rsp.toString(), status); + 
assertEquals(status.toString(), "aborted", status.get("state")); + // let the process continue + TestInjection.reindexLatch.countDown(); + CloudTestUtils.waitForState(cloudManager, "source collection is in wrong state", + sourceCollection, (liveNodes, docCollection) -> !docCollection.isReadOnly() && getState(sourceCollection) == null); + // verify the response + rsp = CollectionAdminRequest.requestStatus(asyncId).process(solrClient); + status = (Map)rsp.getResponse().get(ReindexCollectionCmd.REINDEX_STATUS); + assertNotNull(rsp.toString(), status); + assertEquals(status.toString(), "aborted", status.get("state")); + } + + private void createCollection(String name, String config, int numShards, int numReplicas) throws Exception { + CollectionAdminRequest.createCollection(name, config, numShards, numReplicas) + .setMaxShardsPerNode(-1) + .process(solrClient); + + cluster.waitForActiveCollection(name, numShards, numShards * numReplicas); + } + + private void indexDocs(String collection, int numDocs, Function generator) throws Exception { + List docs = new ArrayList<>(); + for (int i = 0; i < numDocs; i++) { + docs.add(generator.apply(i)); + } + solrClient.add(collection, docs); + solrClient.commit(collection); + // verify the docs exist + QueryResponse rsp = solrClient.query(collection, params(CommonParams.Q, "*:*")); + assertEquals("num docs", NUM_DOCS, rsp.getResults().getNumFound()); + + } +} diff --git a/solr/solr-ref-guide/src/collections-api.adoc b/solr/solr-ref-guide/src/collections-api.adoc index c5f68cc044..3a27ec824e 100644 --- a/solr/solr-ref-guide/src/collections-api.adoc +++ b/solr/solr-ref-guide/src/collections-api.adoc @@ -198,6 +198,7 @@ The attributes that can be modified are: See the <> section above for details on these attributes. +[[readonlymode]] ==== Read-only mode Setting the `readOnly` attribute to `true` puts the collection in read-only mode, in which any index update requests are rejected. Other collection-level actions (eg. adding / @@ -218,6 +219,125 @@ NOTE: This may potentially take a long time if there are still major segment mer Removing the `readOnly` property or setting it to false enables the processing of updates and reloads the collection. +[[reindexcollection]] +== REINDEXCOLLECTION: Re-index a Collection + +`/admin/collections?action=REINDEXCOLLECTION&name=_name_` + +The REINDEXCOLLECTION command re-indexes a collection using existing data from the +source collection. + +NOTE: Re-indexing is potentially a lossy operation - some of the existing indexed data that is not +available as stored fields may be lost, so users should use this command +with caution, evaluating the potential impact by using different source and target +collection names first, and preserving the source collection until the evaluation is +complete. + +The target collection must not exist (and may not be an alias). If the target +collection name is the same as the source collection then first a unique sequential name +will be generated for the target collection, and then after re-indexing is done an alias +will be created that points from the source name to the actual sequentially-named target collection. + +When re-indexing is started the source collection is put in <> to ensure that +all source documents are properly processed. + +Using optional parameters a different index schema, collection shape (number of shards and replicas) +or routing parameters can be requested for the target collection. 
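+
+For example, a minimal asynchronous request that re-indexes a source collection into an explicitly
+named target could look like the following (the collection names and request ID are only illustrative):
+
+[source,text]
+----
+http://localhost:8983/solr/admin/collections?action=REINDEXCOLLECTION&name=mySourceCollection&target=myTargetCollection&async=reindex1
+----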
+
+Re-indexing is executed as a streaming expression daemon, which runs on one of the
+source collection's replicas. It is usually a time-consuming operation, so it's recommended to execute
+it as an asynchronous request in order to avoid request timeouts. Only one re-indexing operation may
+execute concurrently for a given source collection. Long-running, erroneous, or crashed re-indexing
+operations may be terminated by using the `abort` option, which also removes partial results.
+
+=== REINDEXCOLLECTION Parameters
+
+`name`::
+Source collection name, may be an alias. This parameter is required.
+
+`cmd`::
+Optional command. Default command is `start`. Currently supported commands are:
+* `start` - default, starts processing if not already running,
+* `abort` - aborts an already running re-indexing (or clears a left-over status after a crash),
+and deletes partial results,
+* `status` - returns detailed status of a running re-indexing command.
+
+`target`::
+Target collection name, optional. If not specified, a unique name is generated, and
+after all documents have been copied an alias is created that points from the source
+collection name to the unique sequentially-named collection, effectively "hiding"
+the original source collection from regular update and search operations.
+
+`q`::
+Optional query to select documents for re-indexing. Default value is `\*:*`.
+
+`fl`::
+Optional list of fields to re-index. Default value is `*`.
+
+`rows`::
+Documents are transferred in batches. Depending on the average size of the documents, large
+batch sizes may cause memory issues. Default value is 100.
+
+`configName`::
+`collection.configName`::
+Optional name of the configset for the target collection. Default is the same as the
+source collection.
+
+There are a number of optional parameters that determine the target collection layout. If they
+are not specified in the request, their values are copied from the source collection.
+The following parameters are currently supported (described in detail in the <<create,CREATE>> section):
+`numShards`, `replicationFactor`, `nrtReplicas`, `tlogReplicas`, `pullReplicas`, `maxShardsPerNode`,
+`autoAddReplicas`, `shards`, `policy`, `createNodeSet`, `createNodeSet.shuffle`, `router.*`.
+
+`removeSource`::
+Optional boolean. If true, the source collection will be deleted after the processing
+has successfully finished.
+
+`async`::
+Optional request ID to track this action, which will be <<Asynchronous Calls,processed asynchronously>>.
+
+When the re-indexing process has completed, the target collection is marked using
+`property.rx: "finished"`, and the source collection state is updated to become read-write.
+On any error the command deletes the temporary and target collections and also resets the
+read-only flag of the source collection.
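
Similarly, the `status` and `abort` commands can be issued from SolrJ via `setCommand`, mirroring the usage in the test added by this patch. A small, illustrative sketch (the collection name is a placeholder, and `client` is a configured `CloudSolrClient` as in the previous sketch):

[source,java]
----
import java.util.Map;

import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;

public class ReindexStatusExample {
  static void printStatusAndMaybeAbort(CloudSolrClient client) throws Exception {
    // Check the detailed status of a running re-indexing of "sourceCollection" (illustrative name).
    CollectionAdminResponse rsp = CollectionAdminRequest.reindexCollection("sourceCollection")
        .setCommand("status")
        .process(client);
    Map<?, ?> status = (Map<?, ?>) rsp.getResponse().get("reindexStatus");
    if (status != null) {
      System.out.println("re-indexing state: " + status.get("state"));
    }

    // Abort a long-running or crashed re-indexing and remove its partial results.
    CollectionAdminRequest.reindexCollection("sourceCollection")
        .setCommand("abort")
        .process(client);
  }
}
----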
+
+=== Examples using REINDEXCOLLECTION
+
+*Input*
+
+[source,text]
+----
+http://localhost:8983/solr/admin/collections?action=REINDEXCOLLECTION&name=newCollection&numShards=3&configName=conf2&q=id:aa*&fl=id,string_s
+----
+This request specifies a different configset (and thus potentially a different schema) for the target collection, copies only some of the fields, selects only the documents
+matching a query, and also potentially re-shapes the collection by explicitly specifying 3 shards. Since the target collection
+hasn't been specified in the parameters, a collection with a unique name, e.g. `.rx_newCollection_2`, will be created and on success
+an alias pointing from `newCollection -> .rx_newCollection_2` will be created, effectively replacing the source collection
+for the purpose of indexing and searching. The source collection is assumed to be small, so a synchronous request was made.
+
+*Output*
+
+[source,json]
+----
+{
+  "responseHeader":{
+    "status":0,
+    "QTime":10757},
+  "reindexStatus":{
+    "phase":"done",
+    "inputDocs":13416,
+    "processedDocs":376,
+    "actualSourceCollection":".rx_newCollection_1",
+    "state":"finished",
+    "actualTargetCollection":".rx_newCollection_2",
+    "checkpointCollection":".rx_ck_newCollection"
+  }
+}
+----
+As a result, a new collection `.rx_newCollection_2` has been created, with selected documents re-indexed to 3 shards, and
+with an alias pointing from `newCollection` to this one. The status also shows that the source collection
+was already an alias to `.rx_newCollection_1`, which was likely a result of a previous re-indexing.
+
 [[reload]]
 == RELOAD: Reload a Collection
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/DaemonStream.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/DaemonStream.java
index 9d02ec2df7..ec56bfef1e 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/DaemonStream.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/DaemonStream.java
@@ -25,6 +25,7 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.atomic.AtomicLong;
 import org.apache.solr.client.solrj.io.Tuple;
 import org.apache.solr.client.solrj.io.comp.StreamComparator;
@@ -51,7 +52,7 @@ public class DaemonStream extends TupleStream implements Expressible {
   private ArrayBlockingQueue queue;
   private int queueSize;
   private boolean eatTuples;
-  private long iterations;
+  private AtomicLong iterations = new AtomicLong();
   private long startTime;
   private long stopTime;
   private Exception exception;
@@ -240,7 +241,7 @@ public class DaemonStream extends TupleStream implements Expressible {
     tuple.put(ID, id);
     tuple.put("startTime", startTime);
     tuple.put("stopTime", stopTime);
-    tuple.put("iterations", iterations);
+    tuple.put("iterations", iterations.get());
     tuple.put("state", streamRunner.getState().toString());
     if(exception != null) {
       tuple.put("exception", exception.getMessage());
@@ -253,10 +254,6 @@ public class DaemonStream extends TupleStream implements Expressible {
     this.daemons = daemons;
   }
-  private synchronized void incrementIterations() {
-    ++iterations;
-  }
-
   private synchronized void setStartTime(long startTime) {
     this.startTime = startTime;
   }
@@ -332,7 +329,7 @@ public class DaemonStream extends TupleStream implements Expressible {
         log.error("Error in DaemonStream:" + id, e);
         ++errors;
         if (errors > 100) {
-          log.error("Too many consectutive errors. Stopping DaemonStream:" + id);
+          log.error("Too many consecutive errors. 
Stopping DaemonStream:" + id); break OUTER; } } catch (Throwable t) { @@ -351,7 +348,7 @@ public class DaemonStream extends TupleStream implements Expressible { } } } - incrementIterations(); + iterations.incrementAndGet(); if (sleepMillis > 0) { try { diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java b/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java index 16546894a3..11efb3e57b 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java @@ -784,6 +784,90 @@ public abstract class CollectionAdminRequest } /** + * Returns a SolrRequest to reindex a collection + */ + public static ReindexCollection reindexCollection(String collection) { + return new ReindexCollection(collection); + } + + public static class ReindexCollection extends AsyncCollectionSpecificAdminRequest { + String target; + String query; + String fields; + String configName; + Boolean removeSource; + String cmd; + Integer batchSize; + Map collectionParams = new HashMap<>(); + + private ReindexCollection(String collection) { + super(CollectionAction.REINDEXCOLLECTION, collection); + } + + /** Target collection name (null if the same). */ + public ReindexCollection setTarget(String target) { + this.target = target; + return this; + } + + /** Set optional command (eg. abort, status). */ + public ReindexCollection setCommand(String command) { + this.cmd = command; + return this; + } + + /** Query matching the documents to reindex (default is '*:*'). */ + public ReindexCollection setQuery(String query) { + this.query = query; + return this; + } + + /** Fields to reindex (the same syntax as {@link CommonParams#FL}), default is '*'. */ + public ReindexCollection setFields(String fields) { + this.fields = fields; + return this; + } + + /** Remove source collection after success. Default is false. */ + public ReindexCollection setRemoveSource(boolean removeSource) { + this.removeSource = removeSource; + return this; + } + + /** Copy documents in batches of this size. Default is 100. */ + public ReindexCollection setBatchSize(int batchSize) { + this.batchSize = batchSize; + return this; + } + + /** Config name for the target collection. Default is the same as source. */ + public ReindexCollection setConfigName(String configName) { + this.configName = configName; + return this; + } + + /** Set other supported collection CREATE parameters. */ + public ReindexCollection setCollectionParam(String key, Object value) { + this.collectionParams.put(key, value); + return this; + } + + @Override + public SolrParams getParams() { + ModifiableSolrParams params = (ModifiableSolrParams) super.getParams(); + params.setNonNull("target", target); + params.setNonNull("cmd", cmd); + params.setNonNull(ZkStateReader.CONFIGNAME_PROP, configName); + params.setNonNull(CommonParams.Q, query); + params.setNonNull(CommonParams.FL, fields); + params.setNonNull("removeSource", removeSource); + params.setNonNull(CommonParams.ROWS, batchSize); + collectionParams.forEach((k, v) -> params.setNonNull(k, v)); + return params; + } + } + + /** * Return a SolrRequest for low-level detailed status of the collection. 
*/ public static ColStatus collectionStatus(String collection) { diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/CompositeIdRouter.java b/solr/solrj/src/java/org/apache/solr/common/cloud/CompositeIdRouter.java index 30778b83ac..c4dad33280 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/CompositeIdRouter.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/CompositeIdRouter.java @@ -100,6 +100,11 @@ public class CompositeIdRouter extends HashBasedRouter { return targetSlices; } + @Override + public String getName() { + return NAME; + } + public List partitionRangeByKey(String key, Range range) { List result = new ArrayList<>(3); Range keyRange = keyHashRange(key); diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/DocRouter.java b/solr/solrj/src/java/org/apache/solr/common/cloud/DocRouter.java index 111c74b7f0..335c86dbf7 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/DocRouter.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/DocRouter.java @@ -223,6 +223,7 @@ public abstract class DocRouter { public abstract boolean isTargetSlice(String id, SolrInputDocument sdoc, SolrParams params, String shardId, DocCollection collection); + public abstract String getName(); /** This method is consulted to determine what slices should be queried for a request when * an explicit shards parameter was not used. diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ImplicitDocRouter.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ImplicitDocRouter.java index 0b25fcbaba..7e5162162d 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ImplicitDocRouter.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ImplicitDocRouter.java @@ -76,6 +76,11 @@ public class ImplicitDocRouter extends DocRouter { } @Override + public String getName() { + return NAME; + } + + @Override public Collection getSearchSlicesSingle(String shardKey, SolrParams params, DocCollection collection) { if (shardKey == null) { diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/PlainIdRouter.java b/solr/solrj/src/java/org/apache/solr/common/cloud/PlainIdRouter.java index f1cea474fd..d63c5cde80 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/PlainIdRouter.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/PlainIdRouter.java @@ -19,4 +19,9 @@ package org.apache.solr.common.cloud; public class PlainIdRouter extends HashBasedRouter { public static final String NAME = "plain"; + + @Override + public String getName() { + return NAME; + } } diff --git a/solr/solrj/src/java/org/apache/solr/common/params/CollectionParams.java b/solr/solrj/src/java/org/apache/solr/common/params/CollectionParams.java index de6b247ded..cfef82c607 100644 --- a/solr/solrj/src/java/org/apache/solr/common/params/CollectionParams.java +++ b/solr/solrj/src/java/org/apache/solr/common/params/CollectionParams.java @@ -123,7 +123,9 @@ public interface CollectionParams { NONE(false, LockLevel.NONE), // TODO: not implemented yet MERGESHARDS(true, LockLevel.SHARD), - COLSTATUS(true, LockLevel.NONE) + COLSTATUS(true, LockLevel.NONE), + // this command implements its own locking + REINDEXCOLLECTION(true, LockLevel.NONE) ; public final boolean isWrite; diff --git a/solr/solrj/src/java/org/apache/solr/common/util/Utils.java b/solr/solrj/src/java/org/apache/solr/common/util/Utils.java index d079052c54..50e7c0c3a5 100644 --- a/solr/solrj/src/java/org/apache/solr/common/util/Utils.java +++ b/solr/solrj/src/java/org/apache/solr/common/util/Utils.java @@ 
-40,6 +40,7 @@ import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.NoSuchElementException; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; @@ -571,7 +572,7 @@ public class Utils { VersionedData data = null; try { data = distribStateManager.getData(path); - } catch (KeeperException.NoNodeException e) { + } catch (KeeperException.NoNodeException | NoSuchElementException e) { return Collections.emptyMap(); } if (data == null || data.getData() == null || data.getData().length == 0) return Collections.emptyMap();