Index: solr/core/src/java/org/apache/solr/cloud/DistributedMap.java =================================================================== --- solr/core/src/java/org/apache/solr/cloud/DistributedMap.java (revision 1595026) +++ solr/core/src/java/org/apache/solr/cloud/DistributedMap.java (working copy) @@ -27,6 +27,7 @@ import org.apache.zookeeper.Watcher; import org.apache.zookeeper.ZooDefs; import org.apache.zookeeper.data.ACL; +import org.apache.zookeeper.data.Stat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -160,6 +161,12 @@ return zookeeper.exists(dir + "/" + prefix + trackingId, true); } + public int size() throws KeeperException, InterruptedException { + Stat stat = new Stat(); + zookeeper.getData(dir, null, stat, true); + return stat.getNumChildren(); + } + public void remove(String trackingId) throws KeeperException, InterruptedException { zookeeper.delete(dir + "/" + prefix + trackingId, -1, true); } Index: solr/core/src/java/org/apache/solr/cloud/DistributedQueue.java =================================================================== --- solr/core/src/java/org/apache/solr/cloud/DistributedQueue.java (revision 1595026) +++ solr/core/src/java/org/apache/solr/cloud/DistributedQueue.java (working copy) @@ -18,10 +18,6 @@ package org.apache.solr.cloud; -import java.util.List; -import java.util.NoSuchElementException; -import java.util.TreeMap; - import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.cloud.SolrZkClient; @@ -36,6 +32,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.Set; +import java.util.TreeMap; + /** * A distributed queue from zk recipes. */ @@ -364,6 +366,83 @@ } } + public List peekTopN(int n, Set excludeSet, Long wait) + throws KeeperException, InterruptedException { + ArrayList topN = new ArrayList<>(); + + boolean waitedEnough = false; + TimerContext time = null; + if (wait == Long.MAX_VALUE) time = stats.time(dir + "_peekTopN_wait_forever"); + else time = stats.time(dir + "_peekTopN_wait" + wait); + + try { + TreeMap orderedChildren; + while (true) { + LatchChildWatcher childWatcher = new LatchChildWatcher(); + try { + orderedChildren = orderedChildren(childWatcher); + } catch (KeeperException.NoNodeException e) { + zookeeper.create(dir, new byte[0], acl, CreateMode.PERSISTENT, true); + continue; + } + + if (orderedChildren.size() == 0) { + if(waitedEnough) return null; + childWatcher.await(wait == Long.MAX_VALUE ? DEFAULT_TIMEOUT : wait); + waitedEnough = wait != Long.MAX_VALUE; + continue; + } + + for (String headNode : orderedChildren.values()) { + if (headNode != null && topN.size() < n) { + try { + String id = dir + "/" + headNode; + if (excludeSet != null && excludeSet.contains(id)) continue; + QueueEvent queueEvent = new QueueEvent(id, + zookeeper.getData(dir + "/" + headNode, null, null, true), null); + topN.add(queueEvent); + } catch (KeeperException.NoNodeException e) { + // Another client removed the node first, try next + } + } else { + if (topN.size() >= 1) { + return topN; + } + } + } + + if (topN.size() > 0 ) return topN; + if (waitedEnough) return null; + childWatcher.await(wait == Long.MAX_VALUE ? DEFAULT_TIMEOUT : wait); + waitedEnough = wait != Long.MAX_VALUE; + } + } finally { + time.stop(); + } + } + + /** + * + * Gets last element of the Queue without removing it. + */ + public String getTailId() throws KeeperException, InterruptedException { + TreeMap orderedChildren = null; + orderedChildren = orderedChildren(null); + if(orderedChildren == null || orderedChildren.isEmpty()) return null; + + for(String headNode : orderedChildren.descendingMap().values()) + if (headNode != null) { + try { + QueueEvent queueEvent = new QueueEvent(dir + "/" + headNode, zookeeper.getData(dir + "/" + headNode, + null, null, true), null); + return queueEvent.getId(); + } catch (KeeperException.NoNodeException e) { + // Another client removed the node first, try next + } + } + return null; + } + public static class QueueEvent { @Override public int hashCode() { Index: solr/core/src/java/org/apache/solr/cloud/Overseer.java =================================================================== --- solr/core/src/java/org/apache/solr/cloud/Overseer.java (revision 1595026) +++ solr/core/src/java/org/apache/solr/cloud/Overseer.java (working copy) @@ -1200,7 +1200,6 @@ } } } finally { - if (ccThread != null) { try { ccThread.close(); @@ -1309,6 +1308,7 @@ public void success(String operation) { String op = operation.toLowerCase(Locale.ROOT); + synchronized (stats) { Stat stat = stats.get(op); if (stat == null) { stat = new Stat(); @@ -1316,9 +1316,11 @@ } stat.success.incrementAndGet(); } + } public void error(String operation) { String op = operation.toLowerCase(Locale.ROOT); + synchronized (stats) { Stat stat = stats.get(op); if (stat == null) { stat = new Stat(); @@ -1326,20 +1328,26 @@ } stat.errors.incrementAndGet(); } + } public TimerContext time(String operation) { String op = operation.toLowerCase(Locale.ROOT); - Stat stat = stats.get(op); + Stat stat; + synchronized (stats) { + stat = stats.get(op); if (stat == null) { stat = new Stat(); stats.put(op, stat); } + } return stat.requestTime.time(); } public void storeFailureDetails(String operation, ZkNodeProps request, SolrResponse resp) { String op = operation.toLowerCase(Locale.ROOT); - Stat stat = stats.get(op); + Stat stat ; + synchronized (stats) { + stat = stats.get(op); if (stat == null) { stat = new Stat(); stats.put(op, stat); @@ -1352,6 +1360,7 @@ failedOps.addLast(new FailedOp(request, resp)); } } + } public List getFailureDetails(String operation) { Stat stat = stats.get(operation.toLowerCase(Locale.ROOT)); Index: solr/core/src/java/org/apache/solr/cloud/OverseerCollectionProcessor.java =================================================================== --- solr/core/src/java/org/apache/solr/cloud/OverseerCollectionProcessor.java (revision 1595026) +++ solr/core/src/java/org/apache/solr/cloud/OverseerCollectionProcessor.java (working copy) @@ -56,9 +56,11 @@ import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.common.util.StrUtils; import org.apache.solr.handler.component.ShardHandler; +import org.apache.solr.handler.component.ShardHandlerFactory; import org.apache.solr.handler.component.ShardRequest; import org.apache.solr.handler.component.ShardResponse; import org.apache.solr.update.SolrIndexSplitter; +import org.apache.solr.util.DefaultSolrThreadFactory; import org.apache.solr.util.stats.Snapshot; import org.apache.solr.util.stats.Timer; import org.apache.solr.util.stats.TimerContext; @@ -74,11 +76,15 @@ import java.util.Collection; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.SynchronousQueue; +import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import static org.apache.solr.cloud.Assign.Node; @@ -118,8 +124,6 @@ public static final String DELETESHARD = "deleteshard"; - public static final String REQUESTSTATUS = "status"; - public static final String ROUTER = "router"; public static final String SHARDS_PROP = "shards"; @@ -138,6 +142,8 @@ public static final String COLL_PROP_PREFIX = "property."; + public int maxParallelThreads = 10; + public static final Set KNOWN_CLUSTER_PROPS = ImmutableSet.of(ZkStateReader.LEGACY_CLOUD, ZkStateReader.URL_SCHEME); public static final Map COLL_PROPS = ZkNodeProps.makeMap( @@ -146,8 +152,7 @@ MAX_SHARDS_PER_NODE, "1" ); - // TODO: use from Overseer? - private static final String QUEUE_OPERATION = "operation"; + public ExecutorService tpe ; private static Logger log = LoggerFactory .getLogger(OverseerCollectionProcessor.class); @@ -157,9 +162,13 @@ private DistributedMap completedMap; private DistributedMap failureMap; + final private Set runningTasks; + final private Set collectionWip; + final private HashMap completedTasks; + private String myId; - private ShardHandler shardHandler; + private final ShardHandlerFactory shardHandlerFactory; private String adminPath; @@ -169,13 +178,19 @@ private Overseer.Stats stats; - public OverseerCollectionProcessor(ZkStateReader zkStateReader, String myId, ShardHandler shardHandler, String adminPath, Overseer.Stats stats) { - this(zkStateReader, myId, shardHandler, adminPath, stats, Overseer.getCollectionQueue(zkStateReader.getZkClient(), stats), + final private Set processedZKTasks; + private final Object waitLock = new Object(); + + public OverseerCollectionProcessor(ZkStateReader zkStateReader, String myId, + final ShardHandler shardHandler, + String adminPath, Overseer.Stats stats) { + this(zkStateReader, myId, shardHandler.getShardHandlerFactory(), adminPath, stats, Overseer.getCollectionQueue(zkStateReader.getZkClient(), stats), Overseer.getRunningMap(zkStateReader.getZkClient()), Overseer.getCompletedMap(zkStateReader.getZkClient()), Overseer.getFailureMap(zkStateReader.getZkClient())); } - protected OverseerCollectionProcessor(ZkStateReader zkStateReader, String myId, ShardHandler shardHandler, + protected OverseerCollectionProcessor(ZkStateReader zkStateReader, String myId, + final ShardHandlerFactory shardHandlerFactory, String adminPath, Overseer.Stats stats, DistributedQueue workQueue, @@ -184,107 +199,185 @@ DistributedMap failureMap) { this.zkStateReader = zkStateReader; this.myId = myId; - this.shardHandler = shardHandler; + this.shardHandlerFactory = shardHandlerFactory; this.adminPath = adminPath; this.workQueue = workQueue; this.runningMap = runningMap; this.completedMap = completedMap; this.failureMap = failureMap; this.stats = stats; + this.processedZKTasks = new HashSet<>(); + this.runningTasks = new HashSet(); + this.collectionWip = new HashSet(); + this.completedTasks = new HashMap<>(); } @Override public void run() { - log.info("Process current queue of collection creations"); - LeaderStatus isLeader = amILeader(); - while (isLeader == LeaderStatus.DONT_KNOW) { - log.debug("am_i_leader unclear {}", isLeader); - isLeader = amILeader(); // not a no, not a yes, try ask again - } + log.info("Process current queue of collection creations"); + LeaderStatus isLeader = amILeader(); + while (isLeader == LeaderStatus.DONT_KNOW) { + log.debug("am_i_leader unclear {}", isLeader); + isLeader = amILeader(); // not a no, not a yes, try ask again + } + String oldestItemInWorkQueue = null; + // hasLeftOverItems - used for avoiding re-execution of async tasks that were processed by a previous Overseer. + // This variable is set in case there's any task found on the workQueue when the OCP starts up and + // the id for the queue tail is used as a marker to check for the task in completed/failed map in zk. + // Beyond the marker, all tasks can safely be assumed to have never been executed. + boolean hasLeftOverItems = true; + + try { + oldestItemInWorkQueue = workQueue.getTailId(); + } catch (KeeperException e) { + // We don't need to handle this. This is just a fail-safe which comes in handy in skipping already processed + // async calls. + SolrException.log(log, "", e); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + + if (oldestItemInWorkQueue == null) + hasLeftOverItems = false; + + try { + prioritizeOverseerNodes(); + } catch (Exception e) { + log.error("Unable to prioritize overseer ", e); + } + + this.tpe = new ThreadPoolExecutor(5, 100, 0L, TimeUnit.MILLISECONDS, + new SynchronousQueue(), + new DefaultSolrThreadFactory("OverseerThreadFactory")); try { - prioritizeOverseerNodes(); - } catch (Exception e) { - log.error("Unable to prioritize overseer ", e); + while (!this.isClosed) { + try { + isLeader = amILeader(); + if (LeaderStatus.NO == isLeader) { + break; + } else if (LeaderStatus.YES != isLeader) { + log.debug("am_i_leader unclear {}", isLeader); + continue; // not a no, not a yes, try asking again + } - } - while (!this.isClosed) { - try { - isLeader = amILeader(); - if (LeaderStatus.NO == isLeader) { - break; - } - else if (LeaderStatus.YES != isLeader) { - log.debug("am_i_leader unclear {}", isLeader); - continue; // not a no, not a yes, try asking again - } - - QueueEvent head = workQueue.peek(true); - if(isClosed) break; - final ZkNodeProps message = ZkNodeProps.load(head.getBytes()); + cleanUpWorkQueue(); - final String asyncId = (message.containsKey(ASYNC) && message.get(ASYNC) != null) ? (String) message.get(ASYNC) : null; + boolean waited = false; - try { - if(message.containsKey(ASYNC) && message.get(ASYNC) != null && !runningMap.contains(message.getStr(ASYNC))) - runningMap.put(asyncId, null); - } catch (KeeperException.NodeExistsException e) { - // Just catch and do nothing. The runningMap.contains(..) check ensures that this is the only - // entry point into the runningMap. - // NOTE: Make sure to handle it as soon as OCP gets distributed/multi-threaded. - } + while (runningTasks.size() > maxParallelThreads) { + synchronized (waitLock) { + waitLock.wait(100);//wait for 100 ms or till a task is complete + } + waited = true; + } - log.info("Overseer Collection Processor: Get the message id:" + head.getId() + " message:" + message.toString()); - final String operation = message.getStr(QUEUE_OPERATION); - final TimerContext timerContext = stats.time("collection_" + operation); // even if operation is async, it is sync! - SolrResponse response = null; - try { - response = processMessage(message, operation); - } finally { - timerContext.stop(); - } + if (waited) + cleanUpWorkQueue(); - head.setBytes(SolrResponse.serializable(response)); - if (!operation.equals(REQUESTSTATUS) && asyncId != null) { - if(response.getResponse().get("failure") != null || response.getResponse().get("exception") != null) { - failureMap.put(asyncId, null); - } else { - completedMap.put(asyncId, null); - } - } - if(asyncId != null) - runningMap.remove(asyncId); + List heads = workQueue.peekTopN(maxParallelThreads, processedZKTasks, 2000L); - workQueue.remove(head); + if (heads == null) + continue; - if (response.getResponse().get("failure") != null || response.getResponse().get("exception") != null) { - stats.error("collection_" + operation); - stats.storeFailureDetails("collection_" + operation, message, response); - } else { - stats.success("collection_" + operation); - } + if (isClosed) break; - log.info("Overseer Collection Processor: Message id:" + head.getId() + " complete, response:"+ response.getResponse().toString()); + for (QueueEvent head : heads) { + final ZkNodeProps message = ZkNodeProps.load(head.getBytes()); + String collectionName = message.containsKey(COLLECTION_PROP) ? + message.getStr(COLLECTION_PROP) : message.getStr("name"); + String asyncId = message.getStr(ASYNC); + if (hasLeftOverItems) { + if (head.getId().equals(oldestItemInWorkQueue)) + hasLeftOverItems = false; + if (asyncId != null && (completedMap.contains(asyncId) || failureMap.contains(asyncId))) { + workQueue.remove(head); + continue; + } + } + + if (!checkExclusivity(message, head.getId())) + continue; + + try { + markTaskAsRunning(head, collectionName, asyncId); + } catch (KeeperException.NodeExistsException e) { + // This should never happen + log.error("Tried to pick up task {} when it was already running!", head.getId()); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + + log.info("Overseer Collection Processor: Get the message id:" + head.getId() + " message:" + message.toString()); + String operation = message.getStr(Overseer.QUEUE_OPERATION); + Runner runner = new Runner(message, + operation, head); + tpe.execute(runner); + } + } catch (KeeperException e) { if (e.code() == KeeperException.Code.SESSIONEXPIRED || e.code() == KeeperException.Code.CONNECTIONLOSS) { - log.warn("Overseer cannot talk to ZK"); - return; - } - SolrException.log(log, "", e); - throw new ZooKeeperException( - SolrException.ErrorCode.SERVER_ERROR, "", e); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - return; - } catch (Exception e) { - SolrException.log(log, "", e); - } - } + log.warn("Overseer cannot talk to ZK"); + return; + } + SolrException.log(log, "", e); + throw new ZooKeeperException( + SolrException.ErrorCode.SERVER_ERROR, "", e); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } catch (Exception e) { + SolrException.log(log, "", e); + } + } + } finally { + this.close(); + } } - + + private boolean checkExclusivity(ZkNodeProps message, String id) throws KeeperException, InterruptedException { + String collectionName = message.containsKey(COLLECTION_PROP) ? + message.getStr(COLLECTION_PROP) : message.getStr("name"); + + if(collectionName == null) + return true; + + if(collectionWip.contains(collectionName)) + return false; + + if(processedZKTasks.contains(id)) + return false; + + return true; + } + + private void cleanUpWorkQueue() throws KeeperException, InterruptedException { + synchronized (completedTasks) { + for (String id : completedTasks.keySet()) { + workQueue.remove(completedTasks.get(id)); + processedZKTasks.remove(id); + } + completedTasks.clear(); + } + } + public void close() { isClosed = true; + if(tpe != null) { + if (!tpe.isShutdown()) { + tpe.shutdown(); + try { + tpe.awaitTermination(60, TimeUnit.SECONDS); + } catch (InterruptedException e) { + log.warn("Thread interrupted while waiting for OCP threadpool shutdown."); + Thread.currentThread().interrupt(); + } finally { + if (!tpe.isShutdown()) + tpe.shutdownNow(); + } + } + } } private void prioritizeOverseerNodes() throws KeeperException, InterruptedException { @@ -302,9 +395,8 @@ if(nodeNames.size()<2) return; boolean designateIsInFront = overseerDesignates.contains( nodeNames.get(0)); -// ArrayList nodesTobePushedBack = new ArrayList<>(); - //ensure that the node right behind the leader , i.r at position 1 is a Overseer + //ensure that the node right behind the leader , i.e at position 1 is a Overseer List availableDesignates = new ArrayList<>(); log.info("sorted nodes {}", nodeNames);//TODO to be removed @@ -409,6 +501,7 @@ private void invokeOverseerOp(String nodeName, String op) { ModifiableSolrParams params = new ModifiableSolrParams(); + ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); params.set(CoreAdminParams.ACTION, CoreAdminAction.OVERSEEROP.toString()); params.set("op", op); params.set("qt", adminPath); @@ -443,8 +536,8 @@ log.info("According to ZK I (id=" + myId + ") am no longer a leader."); return LeaderStatus.NO; } - - + + protected SolrResponse processMessage(ZkNodeProps message, String operation) { log.warn("OverseerCollectionProcessor.processMessage : "+ operation + " , "+ message.toString()); @@ -481,12 +574,11 @@ } else if(LIST.isEqual(operation)) { listCollections(zkStateReader.getClusterState(), results); } else if (CLUSTERSTATUS.isEqual(operation)) { - getClusterStatus(zkStateReader.getClusterState(), message, results); + getClusterStatus(zkStateReader.getClusterState(), message, results); } else { throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown operation:" + operation); } - } catch (Exception e) { SolrException.log(log, "Collection " + operation + " of " + operation + " failed", e); @@ -495,8 +587,7 @@ nl.add("msg", e.getMessage()); nl.add("rspCode", e instanceof SolrException ? ((SolrException)e).code() : -1); results.add("exception", nl); - } - + } return new OverseerSolrResponse(results); } @@ -518,16 +609,16 @@ NamedList stateUpdateQueueStats = new NamedList(); NamedList workQueueStats = new NamedList(); NamedList collectionQueueStats = new NamedList(); - for (Map.Entry entry : this.stats.getStats().entrySet()) { + for (Map.Entry entry : stats.getStats().entrySet()) { String key = entry.getKey(); NamedList lst = new SimpleOrderedMap<>(); if (key.startsWith("collection_")) { collectionStats.add(key.substring(11), lst); - int successes = this.stats.getSuccessCount(entry.getKey()); - int errors = this.stats.getErrorCount(entry.getKey()); + int successes = stats.getSuccessCount(entry.getKey()); + int errors = stats.getErrorCount(entry.getKey()); lst.add("requests", successes); lst.add("errors", errors); - List failureDetails = this.stats.getFailureDetails(key); + List failureDetails = stats.getFailureDetails(key); if (failureDetails != null) { List> failures = new ArrayList<>(); for (Overseer.FailedOp failedOp : failureDetails) { @@ -547,8 +638,8 @@ } else { // overseer stats overseerStats.add(key, lst); - int successes = this.stats.getSuccessCount(entry.getKey()); - int errors = this.stats.getErrorCount(entry.getKey()); + int successes = stats.getSuccessCount(entry.getKey()); + int errors = stats.getErrorCount(entry.getKey()); lst.add("requests", successes); lst.add("errors", errors); } @@ -751,6 +842,7 @@ String replicaName = message.getStr(REPLICA_PROP); DocCollection coll = clusterState.getCollection(collectionName); Slice slice = coll.getSlice(shard); + ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); if(slice==null){ throw new SolrException(ErrorCode.BAD_REQUEST, "Invalid shard name : "+shard+" in collection : "+ collectionName); } @@ -780,7 +872,8 @@ log.warn("Exception trying to unload core " + sreq, e); } - collectShardResponses(!Slice.ACTIVE.equals(replica.getStr(Slice.STATE)) ? new NamedList() : results, false, null); + collectShardResponses(!Slice.ACTIVE.equals(replica.getStr(Slice.STATE)) ? new NamedList() : results, + false, null, shardHandler); if (waitForCoreNodeGone(collectionName, shard, replicaName, 5000)) return;//check if the core unload removed the corenode zk enry deleteCoreNode(collectionName, replicaName, replica, core); // try and ensure core info is removed from clusterstate @@ -973,7 +1066,8 @@ } - private boolean createShard(ClusterState clusterState, ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException { + private boolean createShard(ClusterState clusterState, ZkNodeProps message, NamedList results) + throws KeeperException, InterruptedException { log.info("Create shard invoked: {}", message); String collectionName = message.getStr(COLLECTION_PROP); String shard = message.getStr(SHARD_ID_PROP); @@ -981,6 +1075,7 @@ throw new SolrException(ErrorCode.BAD_REQUEST, "'collection' and 'shard' are required parameters" ); int numSlices = 1; + ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); DocCollection collection = clusterState.getCollection(collectionName); int maxShardsPerNode = collection.getInt(MAX_SHARDS_PER_NODE, 1); int repFactor = message.getInt(REPLICATION_FACTOR, collection.getInt(REPLICATION_FACTOR, 1)); @@ -1033,7 +1128,7 @@ } - processResponses(results); + processResponses(results, shardHandler); log.info("Finished create command on all shards for collection: " + collectionName); @@ -1047,6 +1142,7 @@ String collectionName = message.getStr("collection"); String slice = message.getStr(ZkStateReader.SHARD_ID_PROP); String splitKey = message.getStr("split.key"); + ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); DocCollection collection = clusterState.getCollection(collectionName); DocRouter router = collection.getRouter() != null ? collection.getRouter() : DocRouter.DEFAULT; @@ -1192,7 +1288,7 @@ // do not abort splitshard if the unloading fails // this can happen because the replicas created previously may be down // the only side effect of this is that the sub shard may end up having more replicas than we want - collectShardResponses(results, false, null); + collectShardResponses(results, false, null, shardHandler); String asyncId = message.getStr(ASYNC); HashMap requestMap = new HashMap(); @@ -1225,11 +1321,11 @@ params.set(CoreAdminParams.SHARD, subSlice); setupAsyncRequest(asyncId, requestMap, params, nodeName); addPropertyParams(message, params); - sendShardRequest(nodeName, params); + sendShardRequest(nodeName, params, shardHandler); } collectShardResponses(results, true, - "SPLITSHARD failed to create subshard leaders"); + "SPLITSHARD failed to create subshard leaders", shardHandler); completeAsyncRequest(asyncId, requestMap, results); @@ -1248,11 +1344,11 @@ ModifiableSolrParams p = new ModifiableSolrParams(cmd.getParams()); setupAsyncRequest(asyncId, requestMap, p, nodeName); - sendShardRequest(nodeName, p); + sendShardRequest(nodeName, p, shardHandler); } collectShardResponses(results, true, - "SPLITSHARD timed out waiting for subshard leaders to come up"); + "SPLITSHARD timed out waiting for subshard leaders to come up", shardHandler); completeAsyncRequest(asyncId, requestMap, results); @@ -1273,9 +1369,10 @@ params.set(CoreAdminParams.RANGES, rangesStr); setupAsyncRequest(asyncId, requestMap, params, parentShardLeader.getNodeName()); - sendShardRequest(parentShardLeader.getNodeName(), params); + sendShardRequest(parentShardLeader.getNodeName(), params, shardHandler); - collectShardResponses(results, true, "SPLITSHARD failed to invoke SPLIT core admin command"); + collectShardResponses(results, true, "SPLITSHARD failed to invoke SPLIT core admin command", + shardHandler); completeAsyncRequest(asyncId, requestMap, results); log.info("Index on shard: " + nodeName + " split into two successfully"); @@ -1292,11 +1389,12 @@ setupAsyncRequest(asyncId, requestMap, params, nodeName); - sendShardRequest(nodeName, params); + sendShardRequest(nodeName, params, shardHandler); } collectShardResponses(results, true, - "SPLITSHARD failed while asking sub shard leaders to apply buffered updates"); + "SPLITSHARD failed while asking sub shard leaders to apply buffered updates", + shardHandler); completeAsyncRequest(asyncId, requestMap, results); @@ -1362,7 +1460,7 @@ //Not using this property. Do we really need to use it? //params.set(ZkStateReader.NUM_SHARDS_PROP, numSlices); - sendShardRequest(subShardNodeName, params); + sendShardRequest(subShardNodeName, params, shardHandler); String coreNodeName = waitForCoreNodeName(collection, subShardNodeName, shardName); // wait for the replicas to be seen as active on sub shard leader @@ -1378,13 +1476,14 @@ setupAsyncRequest(asyncId, requestMap, p, nodeName); - sendShardRequest(nodeName, p); + sendShardRequest(nodeName, p, shardHandler); } } collectShardResponses(results, true, - "SPLITSHARD failed to create subshard replicas or timed out waiting for them to come up"); + "SPLITSHARD failed to create subshard replicas or timed out waiting for them to come up", + shardHandler); completeAsyncRequest(asyncId, requestMap, results); @@ -1442,6 +1541,7 @@ } } + static UpdateResponse softCommit(String url) throws SolrServerException, IOException { HttpSolrServer server = null; try { @@ -1488,7 +1588,9 @@ throw new SolrException(ErrorCode.SERVER_ERROR, "Could not find coreNodeName"); } - private void collectShardResponses(NamedList results, boolean abortOnError, String msgOnError) { + private void collectShardResponses(NamedList results, boolean abortOnError, + String msgOnError, + ShardHandler shardHandler) { ShardResponse srsp; do { srsp = shardHandler.takeCompletedOrError(); @@ -1529,14 +1631,15 @@ "The slice: " + slice.getName() + " is currently " + slice.getState() + ". Only INACTIVE (or custom-hashed) slices can be deleted."); } + ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); try { ModifiableSolrParams params = new ModifiableSolrParams(); params.set(CoreAdminParams.ACTION, CoreAdminAction.UNLOAD.toString()); params.set(CoreAdminParams.DELETE_INDEX, "true"); - sliceCmd(clusterState, params, null, slice); + sliceCmd(clusterState, params, null, slice, shardHandler); - processResponses(results); + processResponses(results, shardHandler); ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, Overseer.REMOVESHARD, ZkStateReader.COLLECTION_PROP, collection, @@ -1622,7 +1725,7 @@ if (clusterState.hasCollection(tempSourceCollectionName)) { log.info("Deleting temporary collection: " + tempSourceCollectionName); Map props = ZkNodeProps.makeMap( - QUEUE_OPERATION, DELETECOLLECTION, + Overseer.QUEUE_OPERATION, DELETECOLLECTION, "name", tempSourceCollectionName); try { @@ -1635,6 +1738,8 @@ CompositeIdRouter sourceRouter = (CompositeIdRouter) sourceCollection.getRouter(); DocRouter.Range keyHashRange = sourceRouter.keyHashRange(splitKey); + ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); + log.info("Hash range for split.key: {} is: {}", splitKey, keyHashRange); // intersect source range, keyHashRange and target range // this is the range that has to be split from source and transferred to target @@ -1657,9 +1762,10 @@ String nodeName = targetLeader.getNodeName(); setupAsyncRequest(asyncId, requestMap, params, nodeName); - sendShardRequest(targetLeader.getNodeName(), params); + sendShardRequest(targetLeader.getNodeName(), params, shardHandler); - collectShardResponses(results, true, "MIGRATE failed to request node to buffer updates"); + collectShardResponses(results, true, "MIGRATE failed to request node to buffer updates", + shardHandler); completeAsyncRequest(asyncId, requestMap, results); @@ -1703,7 +1809,7 @@ // create a temporary collection with just one node on the shard leader String configName = zkStateReader.readConfigName(sourceCollection.getName()); Map props = ZkNodeProps.makeMap( - QUEUE_OPERATION, CREATECOLLECTION, + Overseer.QUEUE_OPERATION, CREATECOLLECTION, "name", tempSourceCollectionName, REPLICATION_FACTOR, 1, NUM_SLICES, 1, @@ -1733,10 +1839,11 @@ cmd.setState(ZkStateReader.ACTIVE); cmd.setCheckLive(true); cmd.setOnlyIfLeader(true); - sendShardRequest(tempSourceLeader.getNodeName(), new ModifiableSolrParams(cmd.getParams())); + sendShardRequest(tempSourceLeader.getNodeName(), new ModifiableSolrParams(cmd.getParams()), shardHandler); collectShardResponses(results, true, - "MIGRATE failed to create temp collection leader or timed out waiting for it to come up"); + "MIGRATE failed to create temp collection leader or timed out waiting for it to come up", + shardHandler); log.info("Asking source leader to split index"); params = new ModifiableSolrParams(); @@ -1750,8 +1857,8 @@ setupAsyncRequest(asyncId, requestMap, params, tempNodeName); - sendShardRequest(tempNodeName, params); - collectShardResponses(results, true, "MIGRATE failed to invoke SPLIT core admin command"); + sendShardRequest(tempNodeName, params, shardHandler); + collectShardResponses(results, true, "MIGRATE failed to invoke SPLIT core admin command", shardHandler); completeAsyncRequest(asyncId, requestMap, results); log.info("Creating a replica of temporary collection: {} on the target leader node: {}", @@ -1764,9 +1871,10 @@ params.set(CoreAdminParams.SHARD, tempSourceSlice.getName()); setupAsyncRequest(asyncId, requestMap, params, targetLeader.getNodeName()); - sendShardRequest(targetLeader.getNodeName(), params); + sendShardRequest(targetLeader.getNodeName(), params, shardHandler); collectShardResponses(results, true, - "MIGRATE failed to create replica of temporary collection in target leader node."); + "MIGRATE failed to create replica of temporary collection in target leader node.", + shardHandler); completeAsyncRequest(asyncId, requestMap, results); @@ -1785,10 +1893,11 @@ setupAsyncRequest(asyncId, requestMap, params, tempSourceLeader.getNodeName()); - sendShardRequest(tempSourceLeader.getNodeName(), params); + sendShardRequest(tempSourceLeader.getNodeName(), params, shardHandler); collectShardResponses(results, true, - "MIGRATE failed to create temp collection replica or timed out waiting for them to come up"); + "MIGRATE failed to create temp collection replica or timed out waiting for them to come up", + shardHandler); completeAsyncRequest(asyncId, requestMap, results); log.info("Successfully created replica of temp source collection on target leader node"); @@ -1801,9 +1910,11 @@ setupAsyncRequest(asyncId, requestMap, params, sourceLeader.getNodeName()); - sendShardRequest(targetLeader.getNodeName(), params); + sendShardRequest(targetLeader.getNodeName(), params, shardHandler); collectShardResponses(results, true, - "MIGRATE failed to merge " + tempCollectionReplica2 + " to " + targetLeader.getStr("core") + " on node: " + targetLeader.getNodeName()); + "MIGRATE failed to merge " + tempCollectionReplica2 + + " to " + targetLeader.getStr("core") + " on node: " + targetLeader.getNodeName(), + shardHandler); completeAsyncRequest(asyncId, requestMap, results); @@ -1813,16 +1924,17 @@ params.set(CoreAdminParams.NAME, targetLeader.getStr("core")); setupAsyncRequest(asyncId, requestMap, params, targetLeader.getNodeName()); - sendShardRequest(targetLeader.getNodeName(), params); + sendShardRequest(targetLeader.getNodeName(), params, shardHandler); collectShardResponses(results, true, - "MIGRATE failed to request node to apply buffered updates"); + "MIGRATE failed to request node to apply buffered updates", + shardHandler); completeAsyncRequest(asyncId, requestMap, results); try { log.info("Deleting temporary collection: " + tempSourceCollectionName); props = ZkNodeProps.makeMap( - QUEUE_OPERATION, DELETECOLLECTION, + Overseer.QUEUE_OPERATION, DELETECOLLECTION, "name", tempSourceCollectionName); deleteCollection(new ZkNodeProps(props), results); } catch (Exception e) { @@ -1860,7 +1972,7 @@ } } - private void sendShardRequest(String nodeName, ModifiableSolrParams params) { + private void sendShardRequest(String nodeName, ModifiableSolrParams params, ShardHandler shardHandler) { ShardRequest sreq = new ShardRequest(); params.set("qt", adminPath); sreq.purpose = 1; @@ -1880,6 +1992,7 @@ } } } + private void createCollection(ClusterState clusterState, ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException { String collectionName = message.getStr("name"); if (clusterState.hasCollection(collectionName)) { @@ -1892,9 +2005,9 @@ int repFactor = message.getInt( REPLICATION_FACTOR, 1); + ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); String async = null; - if (message.containsKey("async")) - async = message.getStr("async"); + async = message.getStr("async"); Integer numSlices = message.getInt(NUM_SLICES, null); String router = message.getStr("router.name", DocRouter.DEFAULT_NAME); @@ -1984,7 +2097,8 @@ // For tracking async calls. HashMap requestMap = new HashMap(); - log.info("Creating SolrCores for new collection, shardNames {} , replicationFactor : {}", shardNames, repFactor); + log.info("Creating SolrCores for new collection {}, shardNames {} , replicationFactor : {}", + collectionName, shardNames, repFactor); Map coresToCreate = new LinkedHashMap<>(); for (int i = 1; i <= shardNames.size(); i++) { String sliceName = shardNames.get(i-1); @@ -2049,7 +2163,7 @@ } } - processResponses(results); + processResponses(results, shardHandler); completeAsyncRequest(async, requestMap, results); @@ -2107,6 +2221,7 @@ throw new SolrException(ErrorCode.BAD_REQUEST, "Collection: " + collection + " shard: " + shard + " does not exist"); } + ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); if(node== null){ node = getNodesForNewShard(clusterState,collection, coll.getSlices().size() , coll.getInt(MAX_SHARDS_PER_NODE, 1),coll.getInt(REPLICATION_FACTOR, 1),null).get(0).nodeName; @@ -2179,14 +2294,13 @@ } addPropertyParams(message, params); - sendShardRequest(node, params); + sendShardRequest(node, params, shardHandler); collectShardResponses(results, true, - "ADDREPLICA failed to create replica"); + "ADDREPLICA failed to create replica", shardHandler); } - - private void processResponses(NamedList results) { + private void processResponses(NamedList results, ShardHandler shardHandler) { ShardResponse srsp; do { srsp = shardHandler.takeCompletedOrError(); @@ -2196,7 +2310,6 @@ } while (srsp != null); } - private String createConfNode(String coll, ZkNodeProps message, boolean isLegacyCloud) throws KeeperException, InterruptedException { String configName = message.getStr(OverseerCollectionProcessor.COLL_CONF); if(configName == null){ @@ -2234,19 +2347,21 @@ private void collectionCmd(ClusterState clusterState, ZkNodeProps message, ModifiableSolrParams params, NamedList results, String stateMatcher) { log.info("Executing Collection Cmd : " + params); String collectionName = message.getStr("name"); + ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); DocCollection coll = clusterState.getCollection(collectionName); for (Map.Entry entry : coll.getSlicesMap().entrySet()) { Slice slice = entry.getValue(); - sliceCmd(clusterState, params, stateMatcher, slice); + sliceCmd(clusterState, params, stateMatcher, slice, shardHandler); } - processResponses(results); + processResponses(results, shardHandler); } - private void sliceCmd(ClusterState clusterState, ModifiableSolrParams params, String stateMatcher, Slice slice) { + private void sliceCmd(ClusterState clusterState, ModifiableSolrParams params, String stateMatcher, + Slice slice, ShardHandler shardHandler) { Map shards = slice.getReplicasMap(); Set> shardEntries = shards.entrySet(); for (Map.Entry shardEntry : shardEntries) { @@ -2314,12 +2429,13 @@ private void waitForAsyncCallsToComplete(Map requestMap, NamedList results) { for(String k:requestMap.keySet()) { - log.debug("I am Waiting for : " + k + "/" + requestMap.get(k)); + log.debug("I am Waiting for :{}/{}", k, requestMap.get(k)); results.add(requestMap.get(k), waitForCoreAdminAsyncCallToComplete(k, requestMap.get(k))); } } private NamedList waitForCoreAdminAsyncCallToComplete(String nodeName, String requestId) { + ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); ModifiableSolrParams params = new ModifiableSolrParams(); params.set(CoreAdminParams.ACTION, CoreAdminAction.REQUESTSTATUS.toString()); params.set(CoreAdminParams.REQUESTID, requestId); @@ -2348,7 +2464,7 @@ try { Thread.sleep(1000); } catch (InterruptedException e) { - e.printStackTrace(); + Thread.currentThread().interrupt(); } continue; @@ -2377,6 +2493,146 @@ } while (srsp != null); } while(true); } + + private void markTaskAsRunning(QueueEvent head, String collectionName,String asyncId) + throws KeeperException, InterruptedException { + synchronized (processedZKTasks) { + processedZKTasks.add(head.getId()); + } + + synchronized (runningTasks) { + runningTasks.add(head.getId()); + } + if(collectionName != null) { + synchronized (collectionWip) { + collectionWip.add(collectionName); + } + } + + if(asyncId != null) + runningMap.put(asyncId, null); + } + + protected class Runner implements Runnable { + ZkNodeProps message; + String operation; + SolrResponse response; + QueueEvent head; + + public Runner(ZkNodeProps message, String operation, QueueEvent head) { + this.message = message; + this.operation = operation; + this.head = head; + response = null; + } + + + @Override + public void run() { + + final TimerContext timerContext = stats.time("collection_" + operation); + + String asyncId = message.getStr(ASYNC); + String collectionName = message.containsKey(COLLECTION_PROP) ? + message.getStr(COLLECTION_PROP) : message.getStr("name"); + try { + try { + response = processMessage(message, operation); + } finally { + timerContext.stop(); + updateStats(); + } + + if(asyncId != null) { + if (response != null && (response.getResponse().get("failure") != null || response.getResponse().get("exception") != null)) { + failureMap.put(asyncId, null); + } else { + completedMap.put(asyncId, null); + } + } else { + head.setBytes(SolrResponse.serializable(response)); + } + + markTaskComplete(head.getId(), asyncId, collectionName); + + log.info("Overseer Collection Processor: Message id:" + head.getId() + + " complete, response:" + response.getResponse().toString()); + } catch (KeeperException e) { + // Reset task from tracking data structures so that it can be retried. + resetTaskWithException(head.getId(), asyncId, collectionName); + SolrException.log(log, "", e); + } catch (InterruptedException e) { + // Reset task from tracking data structures so that it can be retried. + resetTaskWithException(head.getId(), asyncId, collectionName); + Thread.currentThread().interrupt(); + }finally { + synchronized (waitLock){ + waitLock.notifyAll(); + } + } + } + + private void markTaskComplete(String id, String asyncId, String collectionName) + throws KeeperException, InterruptedException { + synchronized (completedTasks) { + completedTasks.put(id, head); + } + + synchronized (runningTasks) { + runningTasks.remove(id); + } + + if(asyncId != null) + runningMap.remove(asyncId); + + if(collectionName != null) { + synchronized (collectionWip) { + collectionWip.remove(collectionName); + } + } + } + + private void resetTaskWithException(String id, String asyncId, String collectionName) { + log.warn("Resetting task: {}, requestid: {}, collectionName: {}", id, asyncId, collectionName); + try { + if (asyncId != null) + runningMap.remove(asyncId); + + synchronized (runningTasks) { + runningTasks.remove(id); + } + + if (collectionName != null) { + synchronized (collectionWip) { + collectionWip.remove(collectionName); + } + } + } catch (KeeperException e) { + SolrException.log(log, "", e); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + + } + + private void updateStats() { + if (isSuccessful()) { + stats.success("collection_" + operation); + } else { + stats.error("collection_" + operation); + stats.storeFailureDetails("collection_" + operation, message, response); + } + } + + private boolean isSuccessful() { + return !(response.getResponse().get("failure") != null || response.getResponse().get("exception") != null); + } + } + + + + + String getId(){ return myId; } Index: solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java =================================================================== --- solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java (revision 1595026) +++ solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java (working copy) @@ -283,16 +283,16 @@ success.add("state", "completed"); success.add("msg", "found " + requestId + " in completed tasks"); results.add("status", success); + } else if (coreContainer.getZkController().getOverseerFailureMap().contains(requestId)) { + SimpleOrderedMap success = new SimpleOrderedMap(); + success.add("state", "failed"); + success.add("msg", "found " + requestId + " in failed tasks"); + results.add("status", success); } else if (coreContainer.getZkController().getOverseerRunningMap().contains(requestId)) { SimpleOrderedMap success = new SimpleOrderedMap(); success.add("state", "running"); success.add("msg", "found " + requestId + " in submitted tasks"); results.add("status", success); - } else if (coreContainer.getZkController().getOverseerFailureMap().contains(requestId)) { - SimpleOrderedMap success = new SimpleOrderedMap(); - success.add("state", "failed"); - success.add("msg", "found " + requestId + " in failed tasks"); - results.add("status", success); } else { SimpleOrderedMap failure = new SimpleOrderedMap(); failure.add("state", "notfound"); Index: solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java =================================================================== --- solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java (revision 1595026) +++ solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java (working copy) @@ -1345,12 +1345,14 @@ * Helper method to add a task to a tracking map. */ protected void addTask(String map, TaskObject o, boolean limit) { + synchronized (map) { if(limit && getMap(map).size() == MAX_TRACKED_REQUESTS) { String key = getMap(map).entrySet().iterator().next().getKey(); getMap(map).remove(key); } addTask(map, o); } + } protected void addTask(String map, TaskObject o) { Index: solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java =================================================================== --- solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java (revision 1595026) +++ solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java (working copy) @@ -405,6 +405,10 @@ ClientUtils.addSlices(target, collectionName, slices, multiCollection); } + public ShardHandlerFactory getShardHandlerFactory(){ + return httpShardHandlerFactory; + } + } Index: solr/core/src/java/org/apache/solr/handler/component/ShardHandler.java =================================================================== --- solr/core/src/java/org/apache/solr/handler/component/ShardHandler.java (revision 1595026) +++ solr/core/src/java/org/apache/solr/handler/component/ShardHandler.java (working copy) @@ -25,4 +25,5 @@ public abstract ShardResponse takeCompletedIncludingErrors(); public abstract ShardResponse takeCompletedOrError(); public abstract void cancelAll(); + public abstract ShardHandlerFactory getShardHandlerFactory(); } Index: solr/core/src/test/org/apache/solr/cloud/MultiThreadedOCPTest.java =================================================================== --- solr/core/src/test/org/apache/solr/cloud/MultiThreadedOCPTest.java (revision 0) +++ solr/core/src/test/org/apache/solr/cloud/MultiThreadedOCPTest.java (working copy) @@ -0,0 +1,236 @@ +package org.apache.solr.cloud; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.solr.client.solrj.SolrServer; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.impl.HttpSolrServer; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.response.CollectionAdminResponse; +import org.apache.solr.common.params.CollectionParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.update.DirectUpdateHandler2; +import org.junit.Before; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Random; + +/** + * Tests the Multi threaded Collections API. + */ +public class MultiThreadedOCPTest extends AbstractFullDistribZkTestBase { + + private static Logger log = LoggerFactory + .getLogger(MultiThreadedOCPTest.class); + + private static final boolean DEBUG = false; + private static int NUM_COLLECTIONS = 4; + + @Before + @Override + public void setUp() throws Exception { + super.setUp(); + + useJettyDataDir = false; + + System.setProperty("numShards", Integer.toString(sliceCount)); + System.setProperty("solr.xml.persist", "true"); + } + + public MultiThreadedOCPTest() { + fixShardCount = true; + sliceCount = 2; + shardCount = 4; + } + + @Override + public void doTest() throws Exception { + + testParallelCollectionAPICalls(); + testTaskExclusivity(); + testLongAndShortRunningParallelApiCalls(); + + if (DEBUG) { + super.printLayout(); + } + } + + private void testParallelCollectionAPICalls() throws IOException, SolrServerException { + SolrServer server = createNewSolrServer("", getBaseUrl((HttpSolrServer) clients.get(0))); + + for(int i = 1 ; i <= NUM_COLLECTIONS ; i++) { + CollectionAdminRequest.createCollection("ocptest" + i, 4, "conf1", server, i + ""); + } + + boolean pass = false; + int counter = 0; + while(true) { + int numRunningTasks = 0; + for (int i = 1; i <= NUM_COLLECTIONS; i++) + if (getRequestState(i + "", server).equals("running")) + numRunningTasks++; + if(numRunningTasks > 1) { + pass = true; + break; + } else if(counter++ > 100) + break; + try { + Thread.sleep(100); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + assertTrue("More than one tasks were supposed to be running in parallel but they weren't.", pass); + for(int i=1;i<=NUM_COLLECTIONS;i++) { + String state = getRequestStateAfterCompletion(i + "", 30, server); + assertTrue("Task " + i + " did not complete, final state: " + state,state.equals("completed")); + } + } + + private void testTaskExclusivity() throws IOException, SolrServerException { + SolrServer server = createNewSolrServer("", getBaseUrl((HttpSolrServer) clients.get(0))); + CollectionAdminRequest.createCollection("ocptest_shardsplit", 4, "conf1", server, "1000"); + + CollectionAdminRequest.splitShard("ocptest_shardsplit", SHARD1, server, "1001"); + CollectionAdminRequest.splitShard("ocptest_shardsplit", SHARD2, server, "1002"); + + int iterations = 0; + while(true) { + int runningTasks = 0; + int completedTasks = 0; + for (int i=1001;i<=1002;i++) { + String state = getRequestState(i, server); + if (state.equals("running")) + runningTasks++; + if (state.equals("completed")) + completedTasks++; + assertTrue("We have a failed SPLITSHARD task", !state.equals("failed")); + } + // TODO: REQUESTSTATUS might come back with more than 1 running tasks over multiple calls. + // The only way to fix this is to support checking of multiple requestids in a single REQUESTSTATUS task. + + assertTrue("Mutual exclusion failed. Found more than one task running for the same collection", runningTasks < 2); + + if(completedTasks == 2 || iterations++ > 90) + break; + + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + } + } + + private void testLongAndShortRunningParallelApiCalls() throws InterruptedException, IOException, SolrServerException { + + Thread indexThread = new Thread() { + @Override + public void run() { + Random random = random(); + int max = atLeast(random, 200); + for (int id = 101; id < max; id++) { + try { + doAddDoc(String.valueOf(id)); + } catch (Exception e) { + log.error("Exception while adding docs", e); + } + } + } + }; + indexThread.start(); + + try { + Thread.sleep(5000); + + SolrServer server = createNewSolrServer("", getBaseUrl((HttpSolrServer) clients.get(0))); + CollectionAdminRequest.splitShard("collection1", SHARD1, server, "2000"); + + String state = getRequestState("2000", server); + while (!state.equals("running")) { + state = getRequestState("2000", server); + if (state.equals("completed") || state.equals("failed")) + break; + Thread.sleep(100); + } + assertTrue("SplitShard task [2000] was supposed to be in [running] but isn't. It is [" + state + "]", state.equals("running")); + + invokeCollectionApi("action", CollectionParams.CollectionAction.OVERSEERSTATUS.toLower()); + + state = getRequestState("2000", server); + + assertTrue("After invoking OVERSEERSTATUS, SplitShard task [2000] was still supposed to be in [running] but isn't." + + "It is [" + state + "]", state.equals("running")); + + } finally { + try { + indexThread.join(); + } catch (InterruptedException e) { + log.warn("Indexing thread interrupted."); + } + } + } + + void doAddDoc(String id) throws Exception { + index("id", id); + // todo - target diff servers and use cloud clients as well as non-cloud clients + } + + private String getRequestStateAfterCompletion(String requestId, int waitForSeconds, SolrServer server) + throws IOException, SolrServerException { + String state = null; + while(waitForSeconds-- > 0) { + state = getRequestState(requestId, server); + if(state.equals("completed") || state.equals("failed")) + return state; + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + } + } + return state; + } + + private String getRequestState(int requestId, SolrServer server) throws IOException, SolrServerException { + return getRequestState(String.valueOf(requestId), server); + } + + private String getRequestState(String requestId, SolrServer server) throws IOException, SolrServerException { + CollectionAdminResponse response = CollectionAdminRequest.requestStatus(requestId, server); + NamedList innerResponse = (NamedList) response.getResponse().get("status"); + return (String) innerResponse.get("state"); + } + + @Override + public void tearDown() throws Exception { + super.tearDown(); + System.clearProperty("numShards"); + System.clearProperty("zkHost"); + System.clearProperty("solr.xml.persist"); + + // insurance + DirectUpdateHandler2.commitOnClose = true; + } + +} + + + Property changes on: solr/core/src/test/org/apache/solr/cloud/MultiThreadedOCPTest.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +Date Author Id Revision HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: solr/core/src/test/org/apache/solr/cloud/OverseerCollectionProcessorTest.java =================================================================== --- solr/core/src/test/org/apache/solr/cloud/OverseerCollectionProcessorTest.java (revision 1595026) +++ solr/core/src/test/org/apache/solr/cloud/OverseerCollectionProcessorTest.java (working copy) @@ -31,12 +31,13 @@ import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.StrUtils; import org.apache.solr.handler.component.ShardHandler; +import org.apache.solr.handler.component.ShardHandlerFactory; import org.apache.solr.handler.component.ShardRequest; import org.apache.solr.handler.component.ShardResponse; import org.apache.zookeeper.CreateMode; import org.easymock.Capture; +import org.easymock.EasyMock; import org.easymock.IAnswer; -import org.eclipse.jetty.util.BlockingArrayQueue; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; @@ -44,14 +45,17 @@ import org.junit.Test; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Queue; import java.util.Set; +import java.util.concurrent.ArrayBlockingQueue; import static org.easymock.EasyMock.anyBoolean; import static org.easymock.EasyMock.anyObject; @@ -74,39 +78,35 @@ private static DistributedMap runningMapMock; private static DistributedMap completedMapMock; private static DistributedMap failureMapMock; + private static ShardHandlerFactory shardHandlerFactoryMock; private static ShardHandler shardHandlerMock; private static ZkStateReader zkStateReaderMock; private static ClusterState clusterStateMock; private static SolrZkClient solrZkClientMock; private final Map zkMap = new HashMap(); private final Set collectionsSet = new HashSet(); + private SolrResponse lastProcessMessageResult; + private OverseerCollectionProcessorToBeTested underTest; private Thread thread; - private Queue queue = new BlockingArrayQueue<>(); + private Queue queue = new ArrayBlockingQueue<>(10); private class OverseerCollectionProcessorToBeTested extends OverseerCollectionProcessor { - private SolrResponse lastProcessMessageResult; - + public OverseerCollectionProcessorToBeTested(ZkStateReader zkStateReader, - String myId, ShardHandler shardHandler, String adminPath, + String myId, ShardHandlerFactory shardHandlerFactory, + String adminPath, DistributedQueue workQueue, DistributedMap runningMap, DistributedMap completedMap, DistributedMap failureMap) { - super(zkStateReader, myId, shardHandler, adminPath, new Overseer.Stats(), workQueue, runningMap, completedMap, failureMap); + super(zkStateReader, myId, shardHandlerFactory, adminPath, new Overseer.Stats(), workQueue, runningMap, completedMap, failureMap); } @Override - protected SolrResponse processMessage(ZkNodeProps message, String operation) { - lastProcessMessageResult = super.processMessage(message, operation); - log.info("1 : "+System.currentTimeMillis()); - return lastProcessMessageResult; - } - - @Override protected LeaderStatus amILeader() { return LeaderStatus.YES; } @@ -119,10 +119,12 @@ runningMapMock = createMock(DistributedMap.class); completedMapMock = createMock(DistributedMap.class); failureMapMock = createMock(DistributedMap.class); + shardHandlerFactoryMock = createMock(ShardHandlerFactory.class); shardHandlerMock = createMock(ShardHandler.class); zkStateReaderMock = createMock(ZkStateReader.class); clusterStateMock = createMock(ClusterState.class); solrZkClientMock = createMock(SolrZkClient.class); + } @AfterClass @@ -131,6 +133,7 @@ runningMapMock = null; completedMapMock = null; failureMapMock = null; + shardHandlerFactoryMock = null; shardHandlerMock = null; zkStateReaderMock = null; clusterStateMock = null; @@ -145,12 +148,13 @@ reset(runningMapMock); reset(completedMapMock); reset(failureMapMock); + reset(shardHandlerFactoryMock); reset(shardHandlerMock); reset(zkStateReaderMock); reset(clusterStateMock); reset(solrZkClientMock); underTest = new OverseerCollectionProcessorToBeTested(zkStateReaderMock, - "1234", shardHandlerMock, ADMIN_PATH, workQueueMock, runningMapMock, + "1234", shardHandlerFactoryMock, ADMIN_PATH, workQueueMock, runningMapMock, completedMapMock, failureMapMock); zkMap.clear(); collectionsSet.clear(); @@ -163,6 +167,44 @@ } protected Set commonMocks(int liveNodesCount) throws Exception { + + shardHandlerFactoryMock.getShardHandler(); + expectLastCall().andAnswer(new IAnswer() { + @Override + public ShardHandler answer() throws Throwable { + log.info("SHARDHANDLER"); + return shardHandlerMock; + } + }).anyTimes(); + workQueueMock.peekTopN(EasyMock.anyInt(), anyObject(Set.class), EasyMock.anyLong()); + expectLastCall().andAnswer(new IAnswer() { + @Override + public List answer() throws Throwable { + Object result; + int count = 0; + while ((result = queue.peek()) == null) { + Thread.sleep(1000); + count++; + if (count > 1) return null; + } + + return Arrays.asList(result); + } + }).anyTimes(); + + workQueueMock.getTailId(); + expectLastCall().andAnswer(new IAnswer() { + @Override + public Object answer() throws Throwable { + Object result = null; + Iterator iter = queue.iterator(); + while(iter.hasNext()) { + result = iter.next(); + } + return result==null ? null : ((QueueEvent)result).getId(); + } + }).anyTimes(); + workQueueMock.peek(true); expectLastCall().andAnswer(new IAnswer() { @Override @@ -378,7 +420,12 @@ OverseerCollectionProcessor.MAX_SHARDS_PER_NODE, maxShardsPerNode.toString()); } - QueueEvent qe = new QueueEvent("id", ZkStateReader.toJSON(props), null); + QueueEvent qe = new QueueEvent("id", ZkStateReader.toJSON(props), null){ + @Override + public void setBytes(byte[] bytes) { + lastProcessMessageResult = SolrResponse.deserialize( bytes); + } + }; queue.add(qe); } @@ -542,20 +589,23 @@ replay(solrZkClientMock); replay(zkStateReaderMock); replay(clusterStateMock); + replay(shardHandlerFactoryMock); replay(shardHandlerMock); - log.info("clusterstate " +clusterStateMock.hashCode()); + log.info("clusterstate " + clusterStateMock.hashCode()); + startComponentUnderTest(); - issueCreateJob(numberOfSlices, replicationFactor, maxShardsPerNode, (createNodeListOption != CreateNodeListOptions.SEND_NULL)?createNodeList:null, (createNodeListOption != CreateNodeListOptions.DONT_SEND)); + issueCreateJob(numberOfSlices, replicationFactor, maxShardsPerNode, (createNodeListOption != CreateNodeListOptions.SEND_NULL) ? createNodeList : null, (createNodeListOption != CreateNodeListOptions.DONT_SEND)); waitForEmptyQueue(10000); if (collectionExceptedToBeCreated) { - assertNotNull(underTest.lastProcessMessageResult.getResponse().toString(), underTest.lastProcessMessageResult); + assertNotNull(lastProcessMessageResult.getResponse().toString(), lastProcessMessageResult); } + verify(shardHandlerFactoryMock); verify(shardHandlerMock); - + if (collectionExceptedToBeCreated) { verifySubmitCaptures(submitCaptures, numberOfSlices, replicationFactor, createNodeList); Index: solr/core/src/test/org/apache/solr/cloud/OverseerStatusTest.java =================================================================== --- solr/core/src/test/org/apache/solr/cloud/OverseerStatusTest.java (revision 1595026) +++ solr/core/src/test/org/apache/solr/cloud/OverseerStatusTest.java (working copy) @@ -123,22 +123,4 @@ waitForThingsToLevelOut(15); } - - private NamedList invokeCollectionApi(String... args) throws SolrServerException, IOException { - ModifiableSolrParams params = new ModifiableSolrParams(); - SolrRequest request = new QueryRequest(params); - for (int i = 0; i < args.length - 1; i+=2) { - params.add(args[i], args[i+1]); - } - request.setPath("/admin/collections"); - - String baseUrl = ((HttpSolrServer) shardToJetty.get(SHARD1).get(0).client.solrClient) - .getBaseURL(); - baseUrl = baseUrl.substring(0, baseUrl.length() - "collection1".length()); - - HttpSolrServer baseServer = new HttpSolrServer(baseUrl); - baseServer.setConnectionTimeout(15000); - baseServer.setSoTimeout(60000 * 5); - return baseServer.request(request); - } } Index: solr/core/src/test/org/apache/solr/core/MockShardHandlerFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/core/MockShardHandlerFactory.java (revision 1595026) +++ solr/core/src/test/org/apache/solr/core/MockShardHandlerFactory.java (working copy) @@ -57,6 +57,11 @@ @Override public void cancelAll() {} + + @Override + public ShardHandlerFactory getShardHandlerFactory() { + return MockShardHandlerFactory.this; + } }; } Index: solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java =================================================================== --- solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java (revision 1595026) +++ solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java (working copy) @@ -411,6 +411,21 @@ return req.process( server ); } + public static CollectionAdminResponse createCollection(String name, + Integer shards, String conf, + Integer maxShards, + SolrServer server, + String asyncId) throws SolrServerException, IOException { + Create req = new Create(); + req.setCollectionName(name); + req.setRouterName("compositeId"); + req.setNumShards(shards); + req.setConfigName(conf); + req.setAsyncId(asyncId); + req.setMaxShardsPerNode(maxShards); + return req.process(server); + } + public static CollectionAdminResponse reloadCollection( String name, SolrServer server) throws SolrServerException, IOException { return reloadCollection(name, server, null); Index: solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java =================================================================== --- solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java (revision 1595026) +++ solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java (working copy) @@ -90,6 +90,7 @@ public static final String LEGACY_CLOUD = "legacyCloud"; public static final String URL_SCHEME = "urlScheme"; + public static final String MAX_COLL_PROCESSOR_THREADS = "maxCollectionProcessorThreads"; private volatile ClusterState clusterState; Index: solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java =================================================================== --- solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java (revision 1595026) +++ solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java (working copy) @@ -68,6 +68,7 @@ import org.apache.solr.common.params.CollectionParams.CollectionAction; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.StrUtils; import org.apache.solr.core.CoreContainer; import org.apache.solr.core.SolrCore; @@ -1719,7 +1720,26 @@ } } + protected NamedList invokeCollectionApi(String... args) throws SolrServerException, IOException { + ModifiableSolrParams params = new ModifiableSolrParams(); + SolrRequest request = new QueryRequest(params); + for (int i = 0; i < args.length - 1; i+=2) { + params.add(args[i], args[i+1]); + } + request.setPath("/admin/collections"); + String baseUrl = ((HttpSolrServer) shardToJetty.get(SHARD1).get(0).client.solrClient) + .getBaseURL(); + baseUrl = baseUrl.substring(0, baseUrl.length() - "collection1".length()); + + HttpSolrServer baseServer = new HttpSolrServer(baseUrl); + baseServer.setConnectionTimeout(15000); + baseServer.setSoTimeout(60000 * 5); + NamedList r = baseServer.request(request); + baseServer.shutdown(); + return r; + } + protected void createCollection(String collName, CloudSolrServer client, int replicationFactor ,