1 package search;
2 
3 import java.io.FileNotFoundException;
4 import java.io.IOException;
5 import java.util.HashSet;
6 import java.util.List;
7 import java.util.Set;
8 
9 import resource.ResourceLocator;
10import util.MolImportUtil;
11import chemaxon.formats.MolExporter;
12import chemaxon.formats.MolFormatException;
13import chemaxon.sss.SearchConstants;
14import chemaxon.sss.screen.HashCode;
15import chemaxon.sss.search.MolSearch;
16import chemaxon.sss.search.MolSearchOptions;
17import chemaxon.sss.search.SearchException;
18import chemaxon.sss.search.StandardizedMolSearch;
19import chemaxon.struc.Molecule;
20
21/**
22 * Running various types of duplicate search:
23 * <ul>
24 * <li>comparing every pair of molecules</li>
25 * <li>comparing smiles format of molecules</li>
26 * <li>comparing based on hash-code comparison</li>
27 * </ul>
28 * 
29 * @author JChem Base team, ChemAxon Ltd.
30 */
31public final class DuplicateSearchExample {
32
33    /**
34     * Imports molecules from the default input file (1000 structures from NCI data set) and
35     * carries out three solution methods of duplicate search on it.
36     */
37    public static void main(String[] args) {
38        try {
39            new DuplicateSearchExample().run();
40        } catch (SearchException e) {
41            System.out.println("Error during duplicate searching.");
42            e.printStackTrace();
43        } catch (MolFormatException e) {
44            System.out.println("Bad structures in input file.");
45            e.printStackTrace();
46        } catch (FileNotFoundException e) {
47            System.out.println("Input file couldn't be found");
48            e.printStackTrace();
49        } catch (IOException e) {
50            System.out.println("I/O error during molecule import.");
51            e.printStackTrace();
52        }
53    }
54
55    private void run() throws MolFormatException, FileNotFoundException, IOException,
56            SearchException {
57
58        System.out.println("Reading molecules.");
59        String path = ResourceLocator.getDefaultInputPath();
60        List<Molecule> mols = MolImportUtil.moleculeListImport(path);
61        for (int i = 0; i < mols.size(); i++) {
62            mols.get(i).aromatize();    // aromatization is needed for search!
63        }
64
65        // Various duplicate search methods
66        searchForDuplicates(mols);
67        searchForDuplicatesUniqueSmiles(mols);
68        searchForDuplicatesHash(mols);
69    }
70
71    /**
72     * Performs duplicate search with {@link MolSearch} to compare every pairs of molecules.
73     * 
74     * @param mols molecules to search
75     * @throws SearchException if error occurs during duplicate searching
76     */
77    private void searchForDuplicates(List<Molecule> mols) throws SearchException {
78
79        MolSearchOptions searchOptions = new MolSearchOptions(SearchConstants.DUPLICATE);
80        StandardizedMolSearch searcher = new StandardizedMolSearch();
81        searcher.setSearchOptions(searchOptions);
82
83        long start = System.currentTimeMillis();
84        System.out.println();
85        System.out.println("Searching for duplicates.");
86        System.out.println("\tMatching IDs");
87
88        int num = 0;
89        for (int q = 0; q < mols.size(); q++) {
90            searcher.setQuery(mols.get(q));
91            for (int t = 0; t < q; t++) {
92                searcher.setTarget(mols.get(t));
93                if (searcher.isMatching()) {
94                    System.out.printf("\t%d is duplicate of %d\n", q + 1, t + 1);
95                    num++;
96                    break;
97                }
98            }
99        }
00        System.out.printf("Found %d duplicates in %d milliseconds\n", num,
01                System.currentTimeMillis() - start);
02    }
03
04    /**
05     * Searches for duplicates based on comparison of the molecules' unique SMILES
06     * representation.
07     * 
08     * @param mols molecules to search
09     * @throws IOException if error occurs during unique SMILES conversion
10     */
11    private void searchForDuplicatesUniqueSmiles(List<Molecule> mols) throws IOException {
12
13        long start = System.currentTimeMillis();
14
15        System.out.println();
16        System.out.println("Searching for duplicates based on "
17                + "unique SMILES string comparison.");
18        System.out.println("\tMatching IDs");
19
20        Set<String> smilesSet = new HashSet<String>();
21        int num = 0;
22        for (int i = 0; i < mols.size(); i++) {
23            // Create unique SMILES representation
24            String smiles = MolExporter.exportToFormat(mols.get(i), "smiles:u");
25            // Check if the same unique SMILES has already been found
26            if (!smilesSet.contains(smiles)) {
27                smilesSet.add(smiles);
28            } else {
29                // Duplicate found: structure is already contained
30                System.out.println("\t" + (i + 1) + " is duplicate.");
31                num++;
32            }
33        }
34        System.out.printf("Found %d duplicates in %d milliseconds\n", num,
35                System.currentTimeMillis() - start);
36    }
37
38    /**
39     * Searches for duplicates based on the comparison of the molecules' hash code. The
40     * equivalence of the hash codes doesn't imply a structural equivalence, so molecules with
41     * similar hash code should still be matched in structure.
42     * 
43     * @param mols molecules to search
44     * @throws SearchException if error occurs during duplicate searching
45     */
46    private void searchForDuplicatesHash(List<Molecule> mols) throws SearchException {
47
48        StandardizedMolSearch searcher = new StandardizedMolSearch();
49        long start = System.currentTimeMillis();
50        HashCode hc = new HashCode();
51
52        // Generate hash codes
53        int[] codes = new int[mols.size()];
54        for (int i = 0; i < mols.size(); i++) {
55            codes[i] = hc.getHashCode(mols.get(i));
56        }
57
58        System.out.println("\nSearching for duplicates based on "
59                + "hash code comparison and subsequent searching");
60        System.out.println("\tMatching IDs");
61        int num = 0;
62        for (int q = 0; q < mols.size(); q++) {
63            for (int t = 0; t < q; t++) {
64                if (codes[q] == codes[t]) {
65                    // If hash-codes are equal, check with MolSearch
66                    searcher.setQuery(mols.get(q));
67                    searcher.setTarget(mols.get(t));
68                    if (searcher.isMatching()) {
69                        System.out.printf("\t%d is duplicate of %d\n", q + 1, t + 1);
70                        num++;
71                        break;
72                    }
73                }
74            }
75        }
76        System.out.printf("Found %d duplicates in %d milliseconds\n", num,
77                System.currentTimeMillis() - start);
78    }
79
80}
81