Menu

[r3257]: / jTessBoxEditor / trunk / tools / train.ps1  Maximize  Restore  History

Download this file

136 lines (107 with data), 5.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
<#
Automate Tesseract 3.02 language data pack generation process.
@author: Quan Nguyen
@date: 16 April 2013
The script file should be placed in the same directory as Tesseract's binary executables.
All training data files must be prefixed with the language code -- such as:
vie.arial.exp0.tif, vie.font_properties, vie.unicharambigs, vie.frequent_words_list, vie.words_list
-- and placed in a trainfolder directory, which could be placed directly under Tesseract directory.
http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3
Run PowerShell as Administrator and allow script execution by running the following command:
PS > Set-ExecutionPolicy RemoteSigned
Then execute the script by:
PS > .\train.ps1
or
PS > .\train.ps1 trainfolder yourlang [bootstraplang]
Windows PowerShell 2.0 Download: http://support.microsoft.com/kb/968929
#>
# Print the usage text and stop when the first argument is one of the help
# switches (-?, -h or -help).
$firstArg = $args[0]
$wantsHelp = $false
if ($firstArg) {
    $wantsHelp = ($firstArg -eq "-?") -or ($firstArg -eq "-h") -or ($firstArg -eq "-help")
}
if ($wantsHelp) {
    $usageLines = @(
        "Usage: .\train.ps1",
        " or .\train.ps1 trainfolder yourlang [bootstraplang]",
        "where trainfolder directory contains all the training data files prefixed with yourlang, e.g.,",
        "vie.arial.exp0.tif, vie.font_properties, vie.unicharambigs, vie.frequent_words_list, vie.words_list,",
        "and could be placed directly under Tesseract directory"
    )
    foreach ($usageLine in $usageLines) {
        Write-Host $usageLine
    }
    exit
}
# Collect the script inputs, prompting for any that were not given on the
# command line:
#   $args[0] -> $trainDir       folder holding the training data
#   $args[1] -> $lang           language code
#   $args[2] -> $bootstraplang  bootstrap language code (optional)
$trainDir = $args[0]
if (!$trainDir) {
    $trainDir = Read-Host "Enter location of the training data folder"
}
$lang = $args[1]
if (!$lang) {
    $lang = Read-Host "Enter a language code"
}
# Either value is still empty when the user just pressed Enter at the prompt.
if ($lang -eq "" -or $trainDir -eq "") {
    Write-Host "Invalid input"
    # Non-zero exit code so a calling script can tell the run was rejected.
    exit 1
}
# The training data is enumerated from $trainDir, so it must be an existing
# folder; a path to a plain file is rejected as well.
if (!(Test-Path -Path $trainDir -PathType Container)) {
    # throw terminates the script, so no explicit exit is needed after it.
    throw ("{0} is not a valid path" -f $trainDir)
}
$bootstraplang = $args[2]
if (!$bootstraplang) {
    $bootstraplang = Read-Host "Enter a bootstrap language code (optional)"
}
echo "=== Generating Tesseract language data for language: $lang ==="
$fullPath = Resolve-Path $trainDir
echo "** Your training images should be in ""$fullPath"" directory."
# $al collects every training image path without its extension; it is reused
# later when the .tr files are generated.
$al = New-Object System.Collections.ArrayList
echo "Make Box Files"
# Space-separated list of the generated .box files, consumed further down by
# the unicharset_extractor command line.
$boxFiles = ""
Foreach ($entry in dir $trainDir) {
    # Only .tif images whose name starts with the language code are processed.
    If ($entry.name.toLower().endsWith(".tif") -and $entry.name.startsWith($lang)) {
        echo "** Processing image: $entry"
        $nameWoExt = [IO.Path]::Combine($trainDir, $entry.BaseName)
        # ArrayList.Add returns the new element's index; discard it so it does
        # not end up in the script output.
        [void]$al.Add($nameWoExt)
        $imageFile = $nameWoExt + ".tif"
        # tesseract is started with the call operator and separate arguments
        # (instead of a string handed to Invoke-Expression) so that paths
        # containing spaces or special characters reach it intact.
        #Should comment out the tesseract call below after done with editing the box files to prevent them from getting overwritten in repeated runs.
        If (!$bootstraplang) {
            & .\tesseract $imageFile $nameWoExt batch.nochop makebox
        } else {
            #Bootstrapping a new character set
            & .\tesseract $imageFile $nameWoExt -l $bootstraplang batch.nochop makebox
        }
        $boxFiles += $nameWoExt + ".box "
    }
}
echo "** Box files should be edited before continuing. **"
echo "Generate .tr Files"
# Space-separated list of the .tr files, appended to the command lines of the
# training tools that run afterwards.
$trFiles = ""
# $al holds the extension-less path of every training image.
Foreach ($entry in $al) {
    # The call operator passes each path as a single argument, so a path with
    # spaces or special characters is not split or re-interpreted the way it
    # would be inside an Invoke-Expression string.
    & .\tesseract ($entry + ".tif") $entry box.train
    $trFiles += $entry + ".tr "
}
echo "Compute the Character Set"
# -D directs unicharset_extractor to write its "unicharset" file into $trainDir.
Invoke-Expression ".\unicharset_extractor -D $trainDir $boxFiles"
echo "set_unicharset_properties"
# Update that same file in place. The clustering tools read
# $trainDir\unicharset, so the properties must be set on the copy in $trainDir
# and not on a "unicharset" file in the current directory.
Invoke-Expression ".\set_unicharset_properties -U $trainDir\unicharset -O $trainDir\unicharset --script_dir=$trainDir"
echo "Clustering"
# shapeclustering and mftraining share the same font properties and unicharset
# options, followed by the list of .tr files.
$clusterInputs = "-F $trainDir\$lang.font_properties -U $trainDir\unicharset"
Invoke-Expression ".\shapeclustering $clusterInputs $trFiles"
Invoke-Expression ".\mftraining $clusterInputs -O $trainDir\$lang.unicharset $trFiles"
# Move these result files from the current directory into the training folder,
# prefixed with the language code.
foreach ($artifact in "inttemp", "pffmtable") {
    Move-Item -Force -Path $artifact -Destination "$trainDir\$lang.$artifact"
}
#move-item -force -path Microfeat -destination $trainDir\$lang.Microfeat
Invoke-Expression ".\cntraining $trFiles"
# Same relocation for the remaining two result files.
foreach ($artifact in "normproto", "shapetable") {
    Move-Item -Force -Path $artifact -Destination "$trainDir\$lang.$artifact"
}
echo "Dictionary Data"
# Each entry: word-list suffix, DAWG suffix, and whether the word list is
# optional (converted only when the file exists).
$dawgJobs = @(
    @("frequent_words_list", "freq-dawg", $false),
    @("words_list", "word-dawg", $false),
    @("punc", "punc-dawg", $true),
    @("numbers", "number-dawg", $true),
    @("word.bigrams", "bigram-dawg", $true)
)
$langUnicharset = "$trainDir\$lang.unicharset"
foreach ($job in $dawgJobs) {
    $wordListFile = "$trainDir\$lang." + $job[0]
    $dawgFile = "$trainDir\$lang." + $job[1]
    $isOptional = $job[2]
    # Skip optional word lists that are not present in the training folder.
    if ($isOptional -and !(Test-Path $wordListFile)) {
        continue
    }
    Invoke-Expression ".\wordlist2dawg $wordListFile $dawgFile $langUnicharset"
}
echo "The last file (unicharambigs) -- this is to be manually edited"
# Create a unicharambigs file containing only the line "v1" when the training
# folder does not already have one.
$ambigsFile = "$trainDir\$lang.unicharambigs"
$ambigsPresent = Test-Path $ambigsFile
if (-not $ambigsPresent) {
    New-Item -Path $ambigsFile -ItemType File
    Set-Content -Path $ambigsFile -Value "v1"
}
echo "Putting it all together"
# combine_tessdata receives the "<trainDir>\<lang>." prefix, trailing dot included.
$dataPrefix = "$trainDir\$lang."
Invoke-Expression ".\combine_tessdata $dataPrefix"
Want the latest updates on software, tech news, and AI?
Get latest updates about software, tech news, and AI from SourceForge directly in your inbox once a month.