1 package search;
2
3 import java.io.FileNotFoundException;
4 import java.io.IOException;
5 import java.util.HashSet;
6 import java.util.List;
7 import java.util.Set;
8
9 import resource.ResourceLocator;
10import util.MolImportUtil;
11import chemaxon.formats.MolExporter;
12import chemaxon.formats.MolFormatException;
13import chemaxon.sss.SearchConstants;
14import chemaxon.sss.screen.HashCode;
15import chemaxon.sss.search.MolSearch;
16import chemaxon.sss.search.MolSearchOptions;
17import chemaxon.sss.search.SearchException;
18import chemaxon.sss.search.StandardizedMolSearch;
19import chemaxon.struc.Molecule;
20
21
31public final class DuplicateSearchExample {
32
33
37 public static void main(String[] args) {
38 try {
39 new DuplicateSearchExample().run();
40 } catch (SearchException e) {
41 System.out.println("Error during duplicate searching.");
42 e.printStackTrace();
43 } catch (MolFormatException e) {
44 System.out.println("Bad structures in input file.");
45 e.printStackTrace();
46 } catch (FileNotFoundException e) {
47 System.out.println("Input file couldn't be found");
48 e.printStackTrace();
49 } catch (IOException e) {
50 System.out.println("I/O error during molecule import.");
51 e.printStackTrace();
52 }
53 }
54
55 private void run() throws MolFormatException, FileNotFoundException, IOException,
56 SearchException {
57
58 System.out.println("Reading molecules.");
59 String path = ResourceLocator.getDefaultInputPath();
60 List<Molecule> mols = MolImportUtil.moleculeListImport(path);
61 for (int i = 0; i < mols.size(); i++) {
62 mols.get(i).aromatize(); }
64
65 searchForDuplicates(mols);
67 searchForDuplicatesUniqueSmiles(mols);
68 searchForDuplicatesHash(mols);
69 }
70
71
77 private void searchForDuplicates(List<Molecule> mols) throws SearchException {
78
79 MolSearchOptions searchOptions = new MolSearchOptions(SearchConstants.DUPLICATE);
80 StandardizedMolSearch searcher = new StandardizedMolSearch();
81 searcher.setSearchOptions(searchOptions);
82
83 long start = System.currentTimeMillis();
84 System.out.println();
85 System.out.println("Searching for duplicates.");
86 System.out.println("\tMatching IDs");
87
88 int num = 0;
89 for (int q = 0; q < mols.size(); q++) {
90 searcher.setQuery(mols.get(q));
91 for (int t = 0; t < q; t++) {
92 searcher.setTarget(mols.get(t));
93 if (searcher.isMatching()) {
94 System.out.printf("\t%d is duplicate of %d\n", q + 1, t + 1);
95 num++;
96 break;
97 }
98 }
99 }
00 System.out.printf("Found %d duplicates in %d milliseconds\n", num,
01 System.currentTimeMillis() - start);
02 }
03
04
11 private void searchForDuplicatesUniqueSmiles(List<Molecule> mols) throws IOException {
12
13 long start = System.currentTimeMillis();
14
15 System.out.println();
16 System.out.println("Searching for duplicates based on "
17 + "unique SMILES string comparison.");
18 System.out.println("\tMatching IDs");
19
20 Set<String> smilesSet = new HashSet<String>();
21 int num = 0;
22 for (int i = 0; i < mols.size(); i++) {
23 String smiles = MolExporter.exportToFormat(mols.get(i), "smiles:u");
25 if (!smilesSet.contains(smiles)) {
27 smilesSet.add(smiles);
28 } else {
29 System.out.println("\t" + (i + 1) + " is duplicate.");
31 num++;
32 }
33 }
34 System.out.printf("Found %d duplicates in %d milliseconds\n", num,
35 System.currentTimeMillis() - start);
36 }
37
38
46 private void searchForDuplicatesHash(List<Molecule> mols) throws SearchException {
47
48 StandardizedMolSearch searcher = new StandardizedMolSearch();
49 long start = System.currentTimeMillis();
50 HashCode hc = new HashCode();
51
52 int[] codes = new int[mols.size()];
54 for (int i = 0; i < mols.size(); i++) {
55 codes[i] = hc.getHashCode(mols.get(i));
56 }
57
58 System.out.println("\nSearching for duplicates based on "
59 + "hash code comparison and subsequent searching");
60 System.out.println("\tMatching IDs");
61 int num = 0;
62 for (int q = 0; q < mols.size(); q++) {
63 for (int t = 0; t < q; t++) {
64 if (codes[q] == codes[t]) {
65 searcher.setQuery(mols.get(q));
67 searcher.setTarget(mols.get(t));
68 if (searcher.isMatching()) {
69 System.out.printf("\t%d is duplicate of %d\n", q + 1, t + 1);
70 num++;
71 break;
72 }
73 }
74 }
75 }
76 System.out.printf("Found %d duplicates in %d milliseconds\n", num,
77 System.currentTimeMillis() - start);
78 }
79
80}
81