// PDF Trawler
/**
* PDF file loader.
* Trawls a directory and all its subdirectories, and looks for PDF files.
* Extracts the individual molecules from the file and loads them into the
* structure entity at the root of the data tree.
*
* Usage:
* 1. create a structure entity in the project explorer
* 2. Add fields named 'Filename' and 'Name'
* 3. Edit the settings in the 'adjust these variables' section (the defaults are for the
* Pubchem demo data tree in the sample project)
* 4. Run the script
*
* @author Tim Dudgeon
*/
import groovy.io.FileType
import chemaxon.formats.MolImporter
import com.im.commons.progress.*
import com.im.df.api.chem.MarvinStructure
// --------- adjust these variables --------------
// Regex handed to eachFileMatch as the name filter for files to import.
// NOTE: inside a Groovy slashy string a single backslash is passed straight
// through to the regex, so '\.' is what matches a literal dot. Writing '\\.'
// would instead demand a literal backslash followed by any character.
def pattern = ~/.*\.pdf/ // pattern for file to process
def root = new File('C:/Documents/chemaxon/pdfs') // dir to start at
def STRUCTURE_FIELD = 'Structure' // name of structure field
def FILE_FIELD = 'Filename' // name of file field
def NAME_FIELD = 'Name' // name of the name field
// ---------- end of variables -------------------
// Script-wide handles shared by the closures below; they are declared here
// and assigned later, once the root entity has been resolved.
def structF   // field looked up by STRUCTURE_FIELD
def filenameF // field looked up by FILE_FIELD
def nameF     // field looked up by NAME_FIELD
def edp       // entity data provider used for the inserts
def traverse  // recursive directory walker (declared up front so it can call itself)
// ---------- this is the routine that processes a file and loads it
/**
 * Reads every molecule the importer yields for the given file and inserts
 * one row per molecule through the entity data provider.
 *
 * Each row holds the structure, the path of the source file and the
 * molecule's name.
 *
 * @param file  the file to read molecules from
 * @param envRW the environment passed on to each insert call
 */
def perform = { file, envRW ->
    println "processing file $file"
    MolImporter importer = new MolImporter(file, "pdf")
    try {
        def mol = null
        int count = 0
        // read() hands back null once the input is exhausted
        while ((mol = importer.read()) != null) {
            count++
            println "loading $count $mol"
            def vals = [
                (structF.id)  : new MarvinStructure(mol),
                (filenameF.id): file.path,
                (nameF.id)    : mol.name
            ]
            edp.insert(vals, null, envRW)
        }
    } finally {
        // always release the underlying file handle, even if an insert fails
        importer.close()
    }
}
// Resolve the entity at the root of the data tree and its data provider.
def ety = dataTree.rootVertex.entity
edp = ety.schema.dataProvider.getEntityDataProvider(ety)

// Looks a field up by name and fails with a clear message when it is missing,
// rather than letting a later '.id' access die with a NullPointerException.
def requireField = { fieldName ->
    def field = ety.fields.items.find { it.name == fieldName }
    if (field == null) {
        throw new IllegalStateException(
            "Field '" + fieldName + "' not found in the root entity of the data tree")
    }
    field
}

structF = requireField(STRUCTURE_FIELD)
filenameF = requireField(FILE_FIELD)
nameF = requireField(NAME_FIELD)
println "Found fields ${structF.id} and ${filenameF.id}"
// Walks a directory: imports every file whose name matches 'pattern',
// then recurses into each subdirectory.
traverse = { folder ->
    println "Looking at dir $folder"
    folder.eachFileMatch(FileType.FILES, pattern) { candidate ->
        // stop if the script is terminated
        def cancelled = env.getFeedback().isCancelled()
        if (cancelled) {
            def msg = "Importing molecules from $root interupted!"
            println msg
            throw new InterruptedException(msg)
        }
        // each file is loaded under its own 'loading' lock
        edp.lockable.withLock('loading') { envRW -> perform(candidate, envRW) }
    }
    folder.eachDir { subFolder -> traverse(subFolder) }
}
// start the process off
traverse(root)