Uploaded image for project: 'Ignite'
  1. Ignite
  2. IGNITE-12774

Transaction hangs after too many open files NIO exception

    XMLWordPrintableJSON

    Details

    • Type: Bug
    • Status: Resolved
    • Priority: Major
    • Resolution: Fixed
    • Affects Version/s: None
    • Fix Version/s: 2.9, 2.8.1
    • Component/s: None
    • Labels:
      None
    • Release Note:
      Fix transaction hanging after too many open files exception.
    • Ignite Flags:
      Release Notes Required

      Description

      A transaction hangs after a “Too many open files” error and never finishes.

      import java.net.SocketException;
      import java.util.concurrent.atomic.AtomicBoolean;
      import org.apache.ignite.cluster.ClusterNode;
      import org.apache.ignite.configuration.CacheConfiguration;
      import org.apache.ignite.configuration.IgniteConfiguration;
      import org.apache.ignite.failure.StopNodeOrHaltFailureHandler;
      import org.apache.ignite.internal.IgniteEx;
      import org.apache.ignite.lang.IgniteInClosure;
      import org.apache.ignite.plugin.extensions.communication.Message;
      import org.apache.ignite.spi.IgniteSpiException;
      import org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi;
      import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest;
      import org.apache.ignite.transactions.Transaction;
      import org.apache.ignite.transactions.TransactionConcurrency;
      import org.apache.ignite.transactions.TransactionIsolation;
      
      import static org.apache.ignite.cache.CacheAtomicityMode.TRANSACTIONAL;
      import static org.apache.ignite.cache.CacheMode.PARTITIONED;
      
      /**
       * Reproducer for a pessimistic transaction whose commit hangs after the communication SPI of one
       * node starts failing every send with a "Too many open files" socket error.
       */
      public class TooManyOpenFilesTest extends GridCommonAbstractTest {
          /** {@inheritDoc} */
          @Override protected IgniteConfiguration getConfiguration(String igniteInstanceName) throws Exception {
              IgniteConfiguration cfg = super.getConfiguration(igniteInstanceName);

              cfg.setFailureHandler(new StopNodeOrHaltFailureHandler());
              // Every node gets the fault-injecting SPI; the fault stays dormant until the flag is raised.
              cfg.setCommunicationSpi(new TooManyOpenFilesTcpCommunicationSpi());
              cfg.setConsistentId(igniteInstanceName);

              return cfg;
          }

          /** {@inheritDoc} */
          @Override protected void beforeTest() throws Exception {
              super.beforeTest();

              // Start from a clean slate: no running grids, no persistence files.
              stopAllGrids();

              cleanPersistenceDir();
          }

          /** {@inheritDoc} */
          @Override protected void afterTest() throws Exception {
              // Tear down in the same order as the setup cleanup, then let the parent finish.
              stopAllGrids();

              cleanPersistenceDir();

              super.afterTest();
          }

          /**
           * Starts three nodes, opens a pessimistic repeatable-read transaction on node 1, switches on
           * send failures in node 2's communication SPI after the first put, performs three more puts
           * and commits. Afterwards keys 1 and 2 are read back through every node.
           *
           * @throws Exception If failed.
           */
          public void test() throws Exception {
              IgniteEx crd = startGrids(3);

              crd.cluster().active(true);

              CacheConfiguration<Object, Object> ccfg = new CacheConfiguration<>();

              ccfg.setName(DEFAULT_CACHE_NAME);
              ccfg.setAtomicityMode(TRANSACTIONAL);
              ccfg.setBackups(1);
              ccfg.setCacheMode(PARTITIONED);

              crd.getOrCreateCache(ccfg);

              TooManyOpenFilesTcpCommunicationSpi failingSpi =
                  (TooManyOpenFilesTcpCommunicationSpi)grid(2).context().config().getCommunicationSpi();

              IgniteEx txNode = grid(1);

              try (Transaction tx = txNode.transactions().txStart(
                  TransactionConcurrency.PESSIMISTIC,
                  TransactionIsolation.REPEATABLE_READ
              )) {
                  IgniteCache<Object, Object> txCache = txNode.cache(DEFAULT_CACHE_NAME);

                  // First write goes through while communication is still healthy.
                  txCache.put(1, 1);

                  // From now on every send attempted by node 2's SPI throws.
                  failingSpi.throwException.set(true);

                  txCache.put(2, 2);
                  txCache.put(3, 2);
                  txCache.put(4, 2);

                  // The commit is where the hang is observed.
                  tx.commit();
              }

              for (int idx = 0; idx < 3; idx++) {
                  IgniteCache<Object, Object> nodeCache = grid(idx).cache(DEFAULT_CACHE_NAME);

                  assertEquals(1, nodeCache.get(1));
                  assertEquals(2, nodeCache.get(2));
              }
          }

          /**
           * Communication SPI that, once {@link #throwException} is set, rejects every outgoing message
           * with an exception chain ending in a "Too many open files" {@link SocketException}.
           */
          private static class TooManyOpenFilesTcpCommunicationSpi extends TcpCommunicationSpi {
              /** When {@code true}, both {@code sendMessage} overloads throw instead of sending. */
              private final AtomicBoolean throwException = new AtomicBoolean();

              /** {@inheritDoc} */
              @Override public void sendMessage(ClusterNode node, Message msg) throws IgniteSpiException {
                  failIfRequested(node);

                  super.sendMessage(node, msg);
              }

              /** {@inheritDoc} */
              @Override public void sendMessage(
                  ClusterNode node,
                  Message msg,
                  IgniteInClosure<IgniteException> ackC
              ) throws IgniteSpiException {
                  failIfRequested(node);

                  super.sendMessage(node, msg, ackC);
              }

              /**
               * Throws the simulated send failure if fault injection is switched on.
               *
               * @param node Destination node of the message being sent.
               * @throws IgniteSpiException If {@link #throwException} is set.
               */
              private void failIfRequested(ClusterNode node) throws IgniteSpiException {
                  if (throwException.get())
                      throw getException(node);
              }

              /**
               * Builds the simulated failure: an SPI exception wrapping a checked connect failure, which
               * in turn wraps a {@link SocketException} with the message "Too many open files".
               *
               * @param node Node whose ID is embedded in the exception messages.
               * @return Exception to throw from {@code sendMessage}.
               */
              private IgniteSpiException getException(ClusterNode node) {
                  SocketException rootCause = new SocketException("Too many open files");

                  String connectFailureMsg = "Failed to connect to node (is node still alive?). " +
                      "Make sure that each ComputeTask and cache Transaction has a timeout set " +
                      "in order to prevent parties from waiting forever in case of network issues " +
                      "[nodeId=" + node.id() + ", addrs=null]";

                  IgniteCheckedException connectFailure = new IgniteCheckedException(connectFailureMsg, rootCause);

                  return new IgniteSpiException("Failed to send message to remote node: " + node.id(), connectFailure);
              }
          }
      }
      

        Attachments

          Issue Links

            Activity

              People

              • Assignee:
                antonovsergey93 Sergey Antonov
                Reporter:
                antonovsergey93 Sergey Antonov
              • Votes:
                0 Vote for this issue
                Watchers:
                3 Start watching this issue

                Dates

                • Created:
                  Updated:
                  Resolved:

                  Time Tracking

                  Estimated:
                  Original Estimate - Not Specified
                  Not Specified
                  Remaining:
                  Remaining Estimate - 0h
                  0h
                  Logged:
                  Time Spent - 0.5h
                  0.5h