[UIMA-4049] The curious case of the zombie annotation - ASF JIRA

Voters

Watch issue

Watchers

Link

Clone

Update Comment Author

Replace String in Comment

Update Comment Visibility

Delete Comments

XML

Word

Printable

JSON

Details

Type: Bug
Status: Closed
Priority: Major
Resolution: Done
Affects Version/s: None
Fix Version/s: 2.7.0SDK
Component/s: Core Java Framework
Labels:
None

Description

When annotations are removed from indexes, they sometimes come back. The following test case shows how an annotation is removed but is still present when iterating over the index later.

    /**
     * Runs the zombie check against two German sample sentences, each paired with its
     * token offsets given as a flat array of (begin, end) values.
     *
     * @throws Exception if the per-sentence check throws.
     */
    @Test
    public void testForZombies() throws Exception
    {
        // First sample: reported as not producing a zombie
        String cleanText = "Dies flößte Friedrich II. für seine neue Eroberung Besorgnis ein.";
        int[] cleanOffsets = { 0, 4, 5, 11, 12, 21, 22, 25, 26, 29, 30, 35, 36, 40, 41, 50, 51,
                60, 61, 64, 64, 65 };
        testForZombies(cleanText, cleanOffsets);

        // Second sample: reported as the one in which a removed token shows up again
        String zombieText = "Ich bin Franz III. von Hammerfels !";
        int[] zombieOffsets = { 0, 3, 4, 7, 8, 13, 14, 18, 19, 22, 23, 33, 34, 35 };
        testForZombies(zombieText, zombieOffsets);
    }

    /**
     * Builds a CAS containing {@code Token} annotations over the given text, merges each
     * name token ("Friedrich", "Franz") into the directly following suffix token
     * ("II.", "III.") by moving the suffix token's begin offset, removes the name tokens
     * from the indexes and finally fails if any removed word is still returned by the
     * annotation index.
     *
     * @param aText
     *            the document text to set on the CAS.
     * @param aOffsets
     *            flat array of token offsets, read pairwise as (begin, end).
     * @throws Exception
     *             if creating or manipulating the CAS fails.
     */
    public void testForZombies(String aText, int[] aOffsets) throws Exception
    {
        // Dictionaries identifying the token pairs that get merged
        Set<String> firstNames = new HashSet<>();
        firstNames.add("Friedrich");
        firstNames.add("Franz");

        Set<String> ordinals = new HashSet<>();
        ordinals.add("II.");
        ordinals.add("III.");

        // Type system with a single "Token" type derived from the built-in annotation type
        TypeSystemDescription typeSystem = new TypeSystemDescription_impl();
        typeSystem.addType("Token", "", CAS.TYPE_NAME_ANNOTATION);

        // CAS holding the document text
        CAS cas = CasCreationUtils.createCas(typeSystem, null, null);
        cas.setDocumentText(aText);

        Type tokenType = cas.getTypeSystem().getType("Token");
        Feature begin = tokenType.getFeatureByBaseName("begin");

        // One token per (begin, end) pair, added to the indexes right after creation
        int pos = 0;
        while (pos < aOffsets.length) {
            AnnotationFS created = cas.createAnnotation(tokenType, aOffsets[pos],
                    aOffsets[pos + 1]);
            cas.addFsToIndexes(created);
            pos += 2;
        }

        // Dump the initial index content
        for (AnnotationFS current : cas.getAnnotationIndex(tokenType)) {
            System.out.printf("Starting with %s%n", current.getCoveredText());
        }

        // Merge "Franz" "III." -> "Franz III." and "Friedrich" "II." -> "Friedrich II.".
        // The begin offset of the suffix token is changed while that token is still in the
        // index and while the index is being iterated.
        // NOTE(review): presumably this in-place change of an indexed feature is what
        // provokes the zombie - confirm against the UIMA index documentation.
        List<AnnotationFS> obsolete = new ArrayList<>();
        AnnotationFS before = null;
        for (AnnotationFS current : cas.getAnnotationIndex(tokenType)) {
            boolean mergeable = before != null
                    && firstNames.contains(before.getCoveredText())
                    && ordinals.contains(current.getCoveredText());
            if (mergeable) {
                current.setIntValue(begin, before.getBegin());
                obsolete.add(before);
            }
            before = current;
        }

        // Drop the name tokens, which are now covered by the widened suffix tokens, and
        // remember their text for the final check
        Set<String> removedWords = new HashSet<>();
        for (AnnotationFS doomed : obsolete) {
            String word = doomed.getCoveredText();
            System.out.printf("Removing %s%n", word);
            removedWords.add(word);
            cas.removeFsFromIndexes(doomed);
        }

        // None of the removed words may be returned by the index any more
        for (AnnotationFS survivor : cas.getAnnotationIndex(tokenType)) {
            String word = survivor.getCoveredText();
            System.out.printf("Remaining %s%n", word);
            if (removedWords.contains(word)) {
                org.junit.Assert.fail("I saw a zombie!!!");
            }
        }
    }