"""OCR to Gutenberg text: a small wxPython tool for cleaning up OCR output.

The main window holds a notebook with a 'Before' and an 'After' text pane.
Every clean-up action reads the 'Before' pane, transforms the text and writes
the result to the 'After' pane.

This file targets Python 2 and the classic ``wxPython.wx`` namespace.
"""

from wxPython.wx import *
from wxPython.html import *
import math
import sys
import re
import string
import time
import operator

# maybe move this to the method area?
# add as needed...(NOT USED YET)
# these are to designate words which are almost always hyphenated
hyphenPrefixes = { 'dis':'', 're':'' }
hyphenSuffixes = { 'ed':'', 'es':'' }

appVersion = '.3.2'

# Menu command identifiers.
ID_STUB    = 100
ID_OPEN    = 101
ID_SAVE    = 102
ID_PARSE   = 103
ID_HYPHEN  = 104
ID_HEADERS = 105
ID_NUMBERS = 106
ID_CAPS    = 107
ID_SPACES  = 108
ID_EXIT    = 109
ID_HELP    = 140
ID_LICENSE = 141
ID_ABOUT   = 142


############################# Class defs #############################

class MainFrame(wxFrame):
    """Main application window: menus, toolbar and the Before/After panes."""

    def __init__(self, parent, ID, title):
        """Build the menu bar, toolbar, event bindings and notebook pages.

        'test.txt' is loaded into the 'Before' pane at start-up; an IOError
        while loading is reported in a message box.
        """
        wxFrame.__init__(self, parent, ID, title, wxDefaultPosition, wxSize(750, 750))
        if wxPlatform == '__WXMSW__':
            self.icon = wxIcon('imagery\\book.ico', wxBITMAP_TYPE_ICO )
            self.SetIcon(self.icon)
        self.CreateStatusBar()
        self.SetStatusText("Tip: All modifications performed on the \'Before\' pane will be displayed in the \'After\' pane.")

        # create the menubar
        fileMenu = wxMenu()
        fileMenu.Append(ID_OPEN, "&Open", "Open a text file")
        fileMenu.Append(ID_SAVE, "&Save", "Save the new file")
        fileMenu.AppendSeparator()
        fileMenu.Append(ID_EXIT, "E&xit", "Terminate the program")

        parseMenu = wxMenu()
        parseMenu.Append(ID_PARSE, "&Go", "Do it all")
        parseMenu.Append(ID_HEADERS, "&Headers only", "Remove all page headers")
        parseMenu.Append(ID_HYPHEN, "&Hyphens only", "Change only hyphens")
        parseMenu.Append(ID_NUMBERS, "&Numbers only", "Remove all page numbers")
        parseMenu.Append(ID_CAPS, "&Caps only", "Fix CAPITALIZED words")
        parseMenu.Append(ID_SPACES, "&Space sentences only", "Double-space after sentences")

        helpMenu = wxMenu()
        helpMenu.Append(ID_HELP, "&Help", "Index and glossary")
        helpMenu.Append(ID_LICENSE, "&License info", "A limited freeware license")
        helpMenu.AppendSeparator()
        helpMenu.Append(ID_ABOUT, "&About", "More information about this program")

        self.menuBar = wxMenuBar()
        self.menuBar.Append(fileMenu, "&File")
        self.menuBar.Append(parseMenu, "&Parse")
        self.menuBar.Append(helpMenu, "&Help")
        self.SetMenuBar(self.menuBar)

        # create the toolbar; each tool is bound right after it is added
        self.tb = self.CreateToolBar(wxTB_HORIZONTAL|wxNO_BORDER|wxTB_3DBUTTONS)
        self.tb.AddSimpleTool(10, wxBitmap('imagery\\go.bmp', wxBITMAP_TYPE_BMP), "Apply all", "Apply all of the methods except spell-check.")
        EVT_TOOL(self, 10, self.OnParse)
        self.tb.AddSeparator()
        self.tb.AddSimpleTool(50, wxBitmap('imagery\\caps.bmp', wxBITMAP_TYPE_BMP), "CAPITALS", "Fix all CAPITALIZED words at the beginning of sentences.")
        EVT_TOOL(self, 50, self.OnCaps)
        self.tb.AddSimpleTool(60, wxBitmap('imagery\\spaces.bmp', wxBITMAP_TYPE_BMP), "Spaces", "Insert double-space after sentences, where needed.")
        EVT_TOOL(self, 60, self.OnSpaces)
        self.tb.AddSimpleTool(65, wxBitmap('imagery\\paragraph.bmp', wxBITMAP_TYPE_BMP), "Space paragraphs", "Space paragraphs with one blank line.")
        EVT_TOOL(self, 65, self.OnParagraphs)
        self.tb.AddSimpleTool(40, wxBitmap('imagery\\headers.bmp', wxBITMAP_TYPE_BMP), "Remove headers", "Remove chapter headers.")
        EVT_TOOL(self, 40, self.OnHeaders)
        self.tb.AddSimpleTool(20, wxBitmap('imagery\\hyphen.bmp', wxBITMAP_TYPE_BMP), "Remove hyphens", "Remove hyphens at ends of lines, with prompting.")
        EVT_TOOL(self, 20, self.OnHyphen)
        self.tb.AddSimpleTool(30, wxBitmap('imagery\\number.bmp', wxBITMAP_TYPE_BMP), "Page numbers", "Remove page numbers, if alone on a line.")
        EVT_TOOL(self, 30, self.OnNumbers)
        self.tb.AddSimpleTool(85, wxBitmap('imagery\\length.bmp', wxBITMAP_TYPE_BMP), "Line length", "Adjust line lengths.")
        EVT_TOOL(self, 85, self.OnLineLength)
        self.tb.AddSimpleTool(75, wxBitmap('imagery\\check.bmp', wxBITMAP_TYPE_BMP), "Spell check", "Spell check the \'Before\' pane, with prompts")
        EVT_TOOL(self, 75, self.OnSpell)
        self.tb.AddSeparator()
        self.tb.AddSimpleTool(70, wxBitmap('imagery\\after2before.bmp', wxBITMAP_TYPE_BMP), "Switch results", "Replace \'Before\' tab-pane text with \'After\' tab-pane text")
        EVT_TOOL(self, 70, self.OnReplacePane)
        self.tb.Realize()

        EVT_MENU(self, ID_OPEN,   self.OnOpen)
        EVT_MENU(self, ID_SAVE,   self.OnFileSave)
        EVT_MENU(self, ID_PARSE,  self.OnParse)
        EVT_MENU(self, ID_HYPHEN, self.OnHyphen)
        EVT_MENU(self, ID_NUMBERS,self.OnNumbers)
        EVT_MENU(self, ID_HEADERS,self.OnHeaders)
        EVT_MENU(self, ID_CAPS,   self.OnCaps)
        EVT_MENU(self, ID_SPACES, self.OnSpaces)
        EVT_MENU(self, ID_HELP,   self.OnHelp)
        EVT_MENU(self, ID_LICENSE,self.OnLicense)
        EVT_MENU(self, ID_ABOUT,  self.OnAbout)
        EVT_MENU(self, ID_EXIT,   self.ExitFrame)

        ##### create and add notebook pages
        self.nb = wxNotebook(self, -1)
        # create the before tab
        self.nb.txtBefore = wxTextCtrl(self.nb, -1, "",wxPoint(0, 0), wxSize(75, 20), wxTE_MULTILINE|wxTE_RICH )
        self.fontObj = wxFont(11, wxMODERN, wxNORMAL, wxNORMAL, false)
        try:
            self.nb.txtBefore.LoadFile('test.txt')
        except IOError:
            dlg_m = wxMessageDialog (self, 'There was an error opening the new file.', 'Error!', wxOK)
            dlg_m.ShowModal()
            dlg_m.Destroy()
        self._resetStyle(self.nb.txtBefore)
        self.nb.AddPage(self.nb.txtBefore, "Before", TRUE)
        # create the After tab
        self.nb.txtAfter = wxTextCtrl(self.nb, -1, "",wxPoint(0, 0), wxSize(75, 20), wxTE_MULTILINE|wxTE_RICH )
        self._resetStyle(self.nb.txtAfter)
        self.nb.AddPage(self.nb.txtAfter, "After")

    # ------------------------------------------------------------------
    # Private helpers shared by the event handlers.
    # ------------------------------------------------------------------
    def _resetStyle(self, textCtrl):
        """Apply the default black-on-default style and font to all of textCtrl."""
        textCtrl.SetStyle(0, textCtrl.GetLastPosition(), wxTextAttr("BLACK", wxNullColour, self.fontObj))

    def _applyTransform(self, transform, showAfter=1):
        """Run transform(text of 'Before') and display the result in 'After'.

        The busy cursor is always restored, even when transform raises
        (for example IOError for a missing dictionary file).
        """
        wxBeginBusyCursor()
        try:
            txt = transform(self.nb.txtBefore.GetValue())
            self.nb.txtAfter.SetValue(txt)
            self._resetStyle(self.nb.txtAfter)
            if showAfter:
                self.nb.SetSelection(1)
        finally:
            wxEndBusyCursor()

    def _parseAll(self, txt):
        """Apply every clean-up step except the spell check, in fixed order."""
        txt = self.FixCaps(txt)
        txt = self.spaceSentences(txt)
        txt = self.RemovePageHeaders(txt)
        txt = self.removePageNumbers(txt)
        txt = self.removeVolumeNumbers(txt)
        txt = self.RemoveHyphens(txt)
        txt = self.spaceParagraphs(txt)
        txt = self.formatHeaders(txt)
        txt = self.fixLineLength(txt)
        return txt

    def _removeAndFormatHeaders(self, txt):
        """Remove running page headers, then space the chapter heading."""
        return self.formatHeaders(self.RemovePageHeaders(txt))

    def _loadWordFile(self, fileName):
        """Return a dict whose keys are the non-empty lines of fileName.

        Raises IOError when the file cannot be opened.
        """
        words = {}
        f = open(fileName)
        try:
            for line in f.readlines():
                # rstrip also copes with CRLF files and a last line that
                # has no trailing newline
                word = line.rstrip()
                if word:
                    words[word] = 1
        finally:
            f.close()
        return words

    # ------------------------------------------------------------------
    # Menu / toolbar handlers.
    # ------------------------------------------------------------------
    def OnHelp(self, event):
        """Open the HTML help window."""
        helpFrame = HelpFrame(NULL, -1, "Help for OCR to Gutenberg text")
        helpFrame.Show(true)

    def OnLicense(self, event):
        """Show the license text in a message box."""
        # NOTE(review): the line breaks of this message were reconstructed;
        # only the wording is taken from the original literal.
        licMessage = string.join([
            "",
            "OCR to Gutenberg text is freeware for the preparation of e-texts",
            "for the Gutenberg Project.",
            "Any other use of the code, in whole or in part, is by permission only.",
            "",
            "Author: Ray Schumacher",
            "WWW: http://rjs.org",
            "Email: rays@rjs.org",
            ""], '\n')
        dlg = wxMessageDialog(self, licMessage, "License", wxOK | wxICON_INFORMATION)
        dlg.ShowModal()
        dlg.Destroy()

    def OnAbout(self, event):
        """Show the program name and version in a message box."""
        aboutMessage = string.join(["Welcome to\nOCR to Gutenberg text v", appVersion, "\n\n"], '')
        dlg = wxMessageDialog(self, aboutMessage, "About", wxOK | wxICON_INFORMATION)
        dlg.ShowModal()
        dlg.Destroy()

    def OnParse(self, event):
        """Apply all clean-up steps (except spell check) and show the result."""
        self._applyTransform(self._parseAll)

    def OnSpell(self, event):
        """Interactively spell-check the 'Before' pane."""
        self._applyTransform(self.spellCheck)

    def OnHyphen(self, event):
        """Remove end-of-line hyphens; the 'Before' tab stays selected."""
        self._applyTransform(self.RemoveHyphens, 0)

    def OnCaps(self, event):
        """Fix all-caps words at the start of lines."""
        self._applyTransform(self.FixCaps)

    def OnHeaders(self, event):
        """Remove page headers and space the chapter heading."""
        self._applyTransform(self._removeAndFormatHeaders)

    def OnNumbers(self, event):
        """Remove page numbers that stand alone on a line."""
        self._applyTransform(self.removePageNumbers)

    def OnSpaces(self, event):
        """Double-space after sentence ends."""
        self._applyTransform(self.spaceSentences)

    def OnParagraphs(self, event):
        """Separate paragraphs with a blank line."""
        self._applyTransform(self.spaceParagraphs)

    def OnLineLength(self, event):
        """Re-wrap paragraphs to the desired line length."""
        self._applyTransform(self.fixLineLength)

    def OnReplacePane(self, event):
        """Move the 'After' text into 'Before', clear 'After', select 'Before'."""
        self.nb.txtBefore.SetValue(self.nb.txtAfter.GetValue())
        self.nb.txtAfter.SetValue('')
        self._resetStyle(self.nb.txtBefore)
        self.nb.SetSelection(0)

    # ------------------------------------------------------------------
    # Text transformations. Each takes and returns the whole text.
    # ------------------------------------------------------------------
    def formatHeaders(self, txt):
        """Insert blank lines around the first 'CHAPTER <number>' line.

        Style notes kept from the original author:
          Title and Part type headers--5 returns after 6 before
          Chapter headers--3 returns before first line.
          Chapter ends--4 returns before next chapter header.

        Only the first matching line is processed; the loop stops there
        because the insertions shift every later index.
        """
        lineList = string.split(txt, "\n")
        titlePattern = re.compile('^CHAPTER\s+[IVXC\d]+\.?\s*$')
        for thisLine in range(len(lineList)):
            m = titlePattern.search(lineList[thisLine])
            if m:
                # 2 blank lines after the line that follows the heading
                for i in range(2):
                    lineList.insert(thisLine+2, '')
                # 4 blank lines directly after the heading
                for i in range(4):
                    lineList.insert(thisLine+1, '')
                # 3 blank lines before the heading
                for i in range(3):
                    lineList.insert(thisLine, '')
                break
        txt = string.join(lineList, '\n')
        self.SetStatusText("Status: Title headers are spaced properly.")
        return txt

    def RemovePageHeaders(self, txt):
        """Strip running page headers while keeping their page numbers.

        * '<number> CAPS TEXT' and 'CAPS TEXT <number>' lines are reduced to
          the number alone.
        * An all-caps line ending in a period is blanked when a number-only
          line lies within two lines of it.

        Neighbour look-ups are bounds-checked, so the first lines no longer
        wrap around to the end of the text and the last two lines are
        examined as well.
        """
        lineList = string.split(txt, "\n")
        headerPattern1 = re.compile('^\s*(\d+)\s+([A-Z\.]+\s*)+$')
        headerPattern2 = re.compile('^\s*([A-Z\-\,\']+\.?\s+)+(\d+)$')
        headerPattern3 = re.compile('^\s*([A-Z\-\,\']+\s+)*([A-Z\-]+\.)\s?$')
        numPattern = re.compile('^\s*\d+\s*$')
        lineCount = len(lineList)
        progressdlg = wxProgressDialog("Removing headers",
                                       "Please wait...",
                                       lineCount,
                                       self,
                                       wxPD_AUTO_HIDE | wxPD_CAN_ABORT | wxPD_APP_MODAL | wxPD_ESTIMATED_TIME | wxPD_ELAPSED_TIME )
        try:
            for thisLine in range(lineCount):
                m = headerPattern1.search(lineList[thisLine])
                if m:
                    # keeps the page number
                    lineList[thisLine] = m.group(1)
                else:
                    m = headerPattern2.search(lineList[thisLine])
                    if m:
                        lineList[thisLine] = m.group(2)
                    elif headerPattern3.search(lineList[thisLine]):
                        nearNumber = 0
                        for offset in (-2, -1, 1, 2):
                            neighbour = thisLine + offset
                            if 0 <= neighbour < lineCount and numPattern.search(lineList[neighbour]):
                                nearNumber = 1
                                break
                        if nearNumber:
                            lineList[thisLine] = ''
                if operator.mod(thisLine, 10) == 0:
                    progressdlg.Update(thisLine)
        finally:
            progressdlg.Destroy()
        txt = string.join(lineList, '\n')
        self.SetStatusText("Status: Page headers have been removed.")
        return txt

    def removeVolumeNumbers(self, txt):
        """Replace volume notes such as 'vol. 11-23.' with a newline."""
        volPattern = re.compile('vol\.? \d{1,3}-\d{1,3}\.?')
        txt = volPattern.sub('\n', txt)
        self.SetStatusText("Status: All volume notes removed..")
        return txt

    def spaceParagraphs(self, txt):
        """Separate paragraphs with one blank line.

        Heuristics: a line of 3 to 55 characters ends a paragraph, and a
        line starting with a space begins one. A blank line is inserted
        only where one is not already present, so running the step twice
        does not stack blank lines.
        """
        lineList = string.split(txt, "\n")
        # any short line ends a paragraph (?!)
        paraPattern = re.compile('^.{3,55}$')
        # any indented line starts a paragraph (?!)
        indentPattern = re.compile('^ ')
        # NOTE(review): processing starts at line index 21, presumably to
        # leave front matter untouched -- confirm this is intended.
        thisLine = 21
        while thisLine < len(lineList):
            m1 = paraPattern.search(lineList[thisLine])
            if m1:
                # blank line after a paragraph's short last line
                if thisLine + 1 >= len(lineList) or string.strip(lineList[thisLine+1]) != '':
                    lineList.insert(thisLine+1, '')
            m2 = indentPattern.search(lineList[thisLine])
            if m2 and not m1:
                # blank line before an indented first line (the original
                # inserted str(thisLine) here, a debugging leftover)
                if thisLine == 0 or string.strip(lineList[thisLine-1]) != '':
                    lineList.insert(thisLine, '')
                    thisLine = thisLine + 1
            thisLine = thisLine + 1
        txt = string.join(lineList, '\n')
        self.SetStatusText("Status: All paragraphs are spaced with one blank line.")
        return txt

    # Style notes kept from the original author:
    # E. Two spaces after each sentence [watch for ! or ? that do NOT end
    #    sentences, then use only one space].
    # J. Elipses [word. . .] have no spaces before or after ".'s" unless they
    #    end a sentence with four [. . . . ] then it is a sentence ending.
    # K. Dashes will be--dashes--with no extra spaces around them

    def spaceSentences(self, txt):
        """Put two spaces between a sentence end and the next sentence.

        A sentence end is '.', '?' or '!' followed by one whitespace
        character and then a capital letter or a double quote. The
        punctuation mark is preserved (the original rewrote '?' and '!'
        as '.'). Every substitution is counted.
        """
        lineList = string.split(txt, "\n")
        endPattern = re.compile('([?!.])\s(["A-Z])')
        spaced = 0
        progressdlg = wxProgressDialog("Double spacing",
                                       "Please wait...",
                                       len(lineList),
                                       self,
                                       wxPD_AUTO_HIDE | wxPD_CAN_ABORT | wxPD_APP_MODAL | wxPD_ESTIMATED_TIME | wxPD_ELAPSED_TIME )
        try:
            for thisLine in range(len(lineList)):
                # insert the double space
                (lineList[thisLine], number) = endPattern.subn(r'\1  \2', lineList[thisLine])
                spaced = spaced + number
                progressdlg.Update(thisLine)
        finally:
            progressdlg.Destroy()
        txt = string.join(lineList, '\n')
        self.SetStatusText(string.join(["Status:", str(spaced),"sentences were spaced."]))
        return txt

    def FixCaps(self, txt):
        """Change a leading ALL-CAPS word to Capitalised form.

        Only lines that start with an all-caps word of two or more letters
        followed by whitespace and a lower-case letter are changed.
        """
        # AN ALL-CAPS WORD FOLLOWED BY A LOWER CASE LETTER
        capsPattern1 = re.compile('^([A-Z])([A-Z]+)(\s+[a-z].*)$')
        lineList = string.split(txt, '\n')
        capsFixed = 0
        for thisLine in range(len(lineList)):
            m = capsPattern1.search(lineList[thisLine])
            if m:
                fixedCase = string.join([m.group(1), string.lower(m.group(2))], '')
                lineList[thisLine] = string.join([fixedCase, m.group(3)], '')
                capsFixed += 1
        txt = string.join(lineList, '\n')
        self.SetStatusText(string.join(["Status:", str(capsFixed),"words have had case adjusted."]))
        return txt

    def RemoveBrackets(self, txt):
        """Delete '<...>' spans (greedy within a line)."""
        blankPattern = re.compile('<.+>', re.M)
        txt = blankPattern.sub('', txt)
        return txt

    def RemoveBlankLines(self, txt):
        """Collapse a newline, optional whitespace and a newline into one newline."""
        blankPattern = re.compile('\n[\s]*\n')
        txt = blankPattern.sub('\n', txt)
        return txt

    def removePageNumbers(self, txt):
        """Remove lines that contain only a number."""
        numPattern = re.compile('\n\s*\d+\s*\n')
        txt = numPattern.sub('\n', txt)
        return txt

    def spellCheck(self, txt):
        """Prompt for every word found in neither word list.

        For each unknown word the user may Add it to the dictionary, Skip
        it, or Change its spelling. Cancelling the prompt stops the check.
        New words are appended to 'english-words' and 'skip_words'.

        Raises IOError when a word list cannot be read.
        """
        knownWords = self._loadWordFile("english-words")
        skippedWords = self._loadWordFile("skip_words")
        newWords = {}
        newSkipWords = {}
        lineList = string.split(txt, "\n")
        splitPattern = re.compile(r'(\W+)')
        wordPattern = re.compile(r'^[a-zA-Z]+$')
        stopDialog = 0
        for thisLine in range(len(lineList)):
            # the capturing group keeps the separators, so the summed
            # lengths give each word's column
            thisLinesWords = splitPattern.split(lineList[thisLine])
            previousLength = 0
            for word in thisLinesWords:
                lowered = string.lower(word)
                isUnknown = (wordPattern.search(word)
                             and word not in knownWords
                             and word not in skippedWords
                             and lowered not in skippedWords
                             and lowered not in knownWords)
                if isUnknown:
                    wordPosition = self.nb.txtBefore.XYToPosition(previousLength, thisLine)
                    wordEnd = wordPosition + len(word)
                    # highlight the word in yellow
                    self.nb.txtBefore.SetStyle(wordPosition, wordEnd, wxTextAttr("BLACK", "YELLOW"))
                    choiceDlg = wxSingleChoiceDialog(self, word, 'Unknown word', ['Add', 'Skip', 'Change'])
                    # scroll the window to the line and position
                    self.nb.txtBefore.ShowPosition(max(0, wordPosition-50))
                    try:
                        if choiceDlg.ShowModal() == wxID_OK:
                            choice = choiceDlg.GetStringSelection()
                        else:
                            choice = None
                    finally:
                        # destroyed on every path, including Cancel
                        choiceDlg.Destroy()
                    if choice is None:
                        # drop out of the outer for loop
                        stopDialog = 1
                        break
                    if choice == 'Add':
                        knownWords[lowered] = 1
                        newWords[lowered] = 1
                        # un-highlight the word
                        self.nb.txtBefore.SetStyle(wordPosition, wordEnd, wxTextAttr("BLACK", "WHITE"))
                    elif choice == 'Skip':
                        newSkipWords[lowered] = 1
                        # remember it so the same word is not asked again
                        skippedWords[lowered] = 1
                        # highlight the word in grey
                        self.nb.txtBefore.SetStyle(wordPosition, wordEnd, wxTextAttr("BLACK", "LIGHT GREY"))
                    else:
                        dlg = wxTextEntryDialog(self, string.join(['Enter the new spelling for:', word]), 'Change', '')
                        if dlg.ShowModal() == wxID_OK:
                            newSpelling = dlg.GetValue()
                            # whole words only; the replacement is literal
                            changePattern = re.compile(r'\b' + re.escape(word) + r'\b')
                            lineList[thisLine] = changePattern.sub(lambda m, s=newSpelling: s, lineList[thisLine])
                        dlg.Destroy()
                previousLength = previousLength + len(word)
            if stopDialog:
                break
        txt = string.join(lineList, '\n')
        dlg = wxMessageDialog(self, string.join(["Status:", str(len(newWords)),"words added.\n", str(len(newSkipWords)),"words skipped."]), "Complete", wxOK | wxICON_INFORMATION)
        dlg.ShowModal()
        dlg.Destroy()
        self.SetStatusText('')
        self.updateDictionary(newWords, 'english-words', 2)
        self.updateDictionary(newSkipWords, 'skip_words', 1)
        return txt

    def _findContinuationLine(self, lineList, start, skipPatterns):
        """Return the index of the first relevant line after start, or None.

        Blank lines and lines matching any of skipPatterns (page numbers,
        all-caps headers, bracket comments) are stepped over.
        """
        candidate = start + 1
        while candidate < len(lineList):
            text = string.strip(lineList[candidate])
            skip = (text == '')
            for pattern in skipPatterns:
                if skip:
                    break
                if pattern.search(text):
                    skip = 1
            if not skip:
                return candidate
            candidate = candidate + 1
        return None

    def _joinHyphenatedWord(self, lineList, first, second, m1, m2, postHyphenPattern):
        """Move the word tail from line second onto the end of line first.

        m1 is the match on the hyphen-terminated line and m2 the match on
        the continuation line. Returns the number of characters moved,
        counting the tail's separator.
        """
        joined = string.join([m1.group(1), m2.group(1)], '')
        if m2.group(2):
            # add the punctuation that followed the tail
            joined = string.join([joined, m2.group(2)], '')
        lineList[first] = joined
        # remove the tail from the continuation line
        lineList[second] = postHyphenPattern.sub('', lineList[second], 1)
        return len(m2.group(1)) + 1

    def _askHyphenChoice(self, message, title):
        """Ask whether to join or keep a hyphenated word.

        Returns 'Concatenate', 'Hyphenate', or None when cancelled.
        """
        choiceDlg = wxSingleChoiceDialog(self, message, title, ['Concatenate', 'Hyphenate'])
        try:
            if choiceDlg.ShowModal() == wxID_OK:
                return choiceDlg.GetStringSelection()
            return None
        finally:
            choiceDlg.Destroy()

    def RemoveHyphens(self, txt):
        """Re-join words that were hyphenated across a line break.

        For each line ending in '<word>-' the next relevant line supplies
        the tail. Then:
          * hyphenated form in the skip list  -> left alone;
          * one half is a known word and the joined word is known
                                              -> joined automatically;
          * one half is known but the joined word is not -> the user is asked;
          * neither half is a known word      -> joined automatically.
        Cancelling a prompt stops processing. New words are appended to
        'english-words' and 'skip_words'.

        Raises IOError when a word list cannot be read.
        """
        self.SetStatusText("Status: Starting hyphen processing.")
        knownWords = self._loadWordFile("english-words")
        skippedWords = self._loadWordFile("skip_words")
        newWords = {}
        newSkipWords = {}
        lineList = string.split(txt, "\n")
        # 2 or more words with 2 or more letters, all caps?
        capsWordsPattern = re.compile('([A-Z]{2,}\s+)+[A-Z]{2,}')
        # a line with only a <...> comment
        bracketsPattern = re.compile('^<.+>$')
        # the first half of the word
        preHyphenPattern = re.compile('(.+ ([a-zA-Z]+))-$')
        # second half (on next line...)
        postHyphenPattern = re.compile('^([a-zA-Z]+)([,.:;\'])?\s?')
        numPattern = re.compile('^\d+$')
        skipPatterns = [numPattern, capsWordsPattern, bracketsPattern]
        # approximate character offset into the 'Before' pane
        thisPosition = 0
        # hyphens removed without asking the user
        numberRemoved = 0
        skipped = 0
        for thisLine in range(len(lineList)):
            lineList[thisLine] = string.strip(lineList[thisLine])
            m1 = preHyphenPattern.search(lineList[thisLine])
            if m1:
                # the next line might be blank or a header, so skip to the
                # next relevant line (None at the end of the text)
                nextLine = self._findContinuationLine(lineList, thisLine, skipPatterns)
                m2 = None
                if nextLine is not None:
                    m2 = postHyphenPattern.search(lineList[nextLine])
                if m2:
                    firstHalf = m1.group(2)
                    secondHalf = m2.group(1)
                    testWord = string.join([firstHalf, secondHalf], '')
                    testWordHyphenated = string.join([firstHalf, secondHalf], '-')
                    firstKnown = firstHalf in knownWords
                    secondKnown = secondHalf in knownWords
                    # the skip list is stored lower-cased, so check both forms
                    if testWordHyphenated in skippedWords or string.lower(testWordHyphenated) in skippedWords:
                        skipped = skipped + 1
                    elif firstKnown or secondKnown:
                        if testWord in knownWords:
                            # concatenate automatically
                            thisPosition += self._joinHyphenatedWord(lineList, thisLine, nextLine, m1, m2, postHyphenPattern)
                            numberRemoved = numberRemoved + 1
                        else:
                            # a half is known, but the concatenation is not
                            message = string.join([testWordHyphenated, '\n', 'line: ', str(thisLine), '\n', lineList[thisLine], '\n', lineList[nextLine]], '')
                            if firstKnown:
                                title = 'Unknown word.'
                            else:
                                title = 'Unknown word..'
                            choice = self._askHyphenChoice(message, title)
                            if choice is None:
                                # drop out of the outer for loop
                                break
                            if choice == 'Concatenate':
                                knownWords[testWord] = 1
                                newWords[testWord] = 1
                                thisPosition += self._joinHyphenatedWord(lineList, thisLine, nextLine, m1, m2, postHyphenPattern)
                            elif choice == 'Hyphenate':
                                loweredHyphenated = string.lower(testWordHyphenated)
                                newSkipWords[loweredHyphenated] = 1
                                # do not ask again during this run
                                skippedWords[loweredHyphenated] = 1
                    else:
                        # neither half is a word, so we must concatenate
                        thisPosition += self._joinHyphenatedWord(lineList, thisLine, nextLine, m1, m2, postHyphenPattern)
                        numberRemoved = numberRemoved + 1
                else:
                    print('Error: No next line match for line ' + str(thisLine))
            thisPosition += len(lineList[thisLine])+1
        txt = string.join(lineList, '\n')
        self.SetStatusText(string.join(["Status:", str(numberRemoved),"hyphens auto-removed.", str(len(newWords)),"words added.", str(len(newSkipWords)+skipped),"words skipped."]))
        thisPosition += len(newSkipWords)+skipped
        # show, and mark in yellow, where processing stopped
        self.nb.txtBefore.ShowPosition(thisPosition)
        self.nb.txtBefore.SetStyle(max(0, thisPosition-10), thisPosition, wxTextAttr("BLACK", "YELLOW"))
        self.updateDictionary(newWords, 'english-words', 2)
        self.updateDictionary(newSkipWords, 'skip_words', 1)
        return txt

    def _wrapWords(self, words, desiredLength, maxLength):
        """Greedily pack words (a non-empty list) into lines.

        A word is added when that leaves the line closer to desiredLength
        than stopping would, and the line stays shorter than maxLength.
        A word with an inner hyphen may be split at that hyphen.
        Returns the list of wrapped lines.
        """
        hyphenPattern = re.compile('[a-zA-Z]-[a-zA-Z]')
        wrapped = []
        pending = list(words)
        tempLine = pending.pop(0)
        while pending:
            if not tempLine:
                # start the next line
                tempLine = pending.pop(0)
                continue
            word = pending[0]
            used = len(tempLine)
            wholeFits = (used + len(word) - desiredLength < desiredLength - used) and (used + len(word) < maxLength)
            m = hyphenPattern.search(word)
            if m:
                if wholeFits:
                    # a line already ending in a hyphen takes the word
                    # without a space
                    if tempLine[-1:] == '-':
                        tempLine = tempLine + pending.pop(0)
                    else:
                        tempLine = tempLine + ' ' + pending.pop(0)
                else:
                    # split at the first letter-hyphen-letter so the rest
                    # of a multi-hyphen word is kept
                    cut = m.start() + 1
                    head = word[:cut]
                    tail = word[cut+1:]
                    headFits = (len(head) + used + 1 - desiredLength < desiredLength - used) and (used + len(head) < maxLength)
                    if headFits:
                        if tempLine[-1:] == '-':
                            tempLine = tempLine + head
                        else:
                            tempLine = tempLine + ' ' + head
                        # put a hyphen on the end
                        wrapped.append(tempLine + '-')
                        pending[0] = tail
                    else:
                        # the first half won't even fit
                        wrapped.append(tempLine)
                    # must form a new line...
                    tempLine = pending.pop(0)
            elif wholeFits:
                tempLine = tempLine + ' ' + pending.pop(0)
            else:
                # the next word will not fit, write the line
                wrapped.append(tempLine)
                tempLine = pending.pop(0)
        if tempLine:
            # words are left after the loop, write the last line
            wrapped.append(tempLine)
        return wrapped

    def fixLineLength(self, txt):
        """Re-wrap each paragraph to roughly 70 characters per line.

        Notes kept from the original author:
          problem where a line is long after a blank line: does not fix.
          We try to average 65, with 55 to 75 being short and long other
          than for emergencies, which will extend to 51 to 79.
        Assumes that paragraphs and headers have already been spaced.

        Blank or number-only lines and all-caps header lines are copied
        unchanged. Every other run of lines, up to the next line of at
        most one character, is joined and re-wrapped.
        """
        lineList = string.split(txt, "\n")
        newLineList = []
        desiredLength = 70
        maxLength = 79
        blankPattern = re.compile('^\d*\s*$')
        # 2 or more words with 2 or more letters, all caps
        capsWordsPattern = re.compile('([A-Z]{2,}\s+)+[A-Z\.]{2,}')
        lineCount = len(lineList)
        thisLine = 0
        progressdlg = wxProgressDialog("Setting line lengths",
                                       "Please wait...",
                                       lineCount,
                                       self,
                                       wxPD_AUTO_HIDE | wxPD_CAN_ABORT | wxPD_APP_MODAL | wxPD_ESTIMATED_TIME | wxPD_ELAPSED_TIME )
        try:
            # search for the start of a new paragraph
            while thisLine < lineCount:
                if capsWordsPattern.search(lineList[thisLine]) or blankPattern.search(lineList[thisLine]):
                    newLineList.append(lineList[thisLine])
                    progressdlg.Update(thisLine)
                    thisLine += 1
                else:
                    # it is a start line for a new paragraph
                    paragraph = lineList[thisLine]
                    thisLine += 1
                    # a line of at most one character ends the paragraph;
                    # the bounds check replaces the old '\n' sentinel,
                    # which leaked into the output
                    while thisLine < lineCount and len(lineList[thisLine]) > 1:
                        paragraph = string.join([paragraph, lineList[thisLine]])
                        thisLine += 1
                    # a non-blank ending line belongs to the paragraph too;
                    # consume it so that it is not emitted a second time
                    if thisLine < lineCount and not blankPattern.search(lineList[thisLine]):
                        paragraph = string.join([paragraph, lineList[thisLine]])
                        thisLine += 1
                    newLineList.extend(self._wrapWords(string.split(paragraph), desiredLength, maxLength))
        finally:
            progressdlg.Destroy()
        # done, write it
        txt = string.join(newLineList, '\n')
        return txt

    def updateDictionary(self, hash, fileName, sleepTime):
        """Append the keys of hash to fileName, one per line.

        Does nothing when hash is empty. After writing, waits sleepTime
        seconds and reports the update in the status bar.
        """
        if len(hash):
            f = open(fileName, 'a+')
            try:
                for aKey in hash.keys():
                    line = string.join([aKey, '\n'], '')
                    f.write(line)
            finally:
                # the original wrote 'f.close' without calling it
                f.close()
            time.sleep(sleepTime)
            self.SetStatusText(string.join(["Status: The", fileName,"dictionary has been updated with", str(len(hash)), "words."]))

    def ExitFrame(self, event):
        """Close the main window."""
        self.Close(true)

    # ----------------------------------------------------------------------------------------
    # Some handlers.
    # ----------------------------------------------------------------------------------------
    def OnOpen(self, event):
        """Let the user pick a text file and load it into the 'Before' pane."""
        dlg = wxFileDialog (self, "Select a text file to import", ".", "", "*.*", wxOPEN)
        if dlg.ShowModal() == wxID_OK:
            try:
                self.nb.txtBefore.LoadFile(dlg.GetPath())
            except IOError:
                dlg_m = wxMessageDialog (self, 'There was an error opening the new file.', 'Error!', wxOK)
                dlg_m.ShowModal()
                dlg_m.Destroy()
        dlg.Destroy()
        self._resetStyle(self.nb.txtBefore)
        self.nb.SetSelection(0)

    def OnFileSave(self, event):
        """Write the 'After' pane to the file(s) chosen by the user."""
        dlg = wxFileDialog(self, "Save to file", ".", "", "*.txt", wxSAVE)
        if dlg.ShowModal() == wxID_OK:
            for path in dlg.GetPaths():
                try:
                    f = open(path, 'w')
                    try:
                        f.write(self.nb.txtAfter.GetValue())
                    finally:
                        # the original wrote 'f.close' without calling it
                        f.close()
                except IOError:
                    dlg_m = wxMessageDialog (self, 'There was an error writing.', 'Error!', wxOK)
                    dlg_m.ShowModal()
                    dlg_m.Destroy()
        dlg.Destroy()

    def OnCloseMe(self, event):
        """Close the window."""
        self.Close(true)

    def OnCloseWindow(self, event):
        """Destroy the window."""
        self.Destroy()


class HelpFrame(wxFrame):
    """Window that displays the HTML help pages."""

    def __init__(self, parent, ID, title):
        """Create the frame and load 'help/index.htm' into an HTML window."""
        wxFrame.__init__(self, parent, ID, title, wxDefaultPosition, wxSize(600, 550))
        if wxPlatform == '__WXMSW__':
            self.icon = wxIcon('imagery\\book.ico', wxBITMAP_TYPE_ICO )
            self.SetIcon(self.icon)
        self.htmlHelp = wxHtmlWindow(self, -1)
        try:
            self.htmlHelp.LoadPage('help/index.htm')
        except IOError:
            dlg_m = wxMessageDialog (self, 'There was an error opening the file.', 'Error!', wxOK)
            dlg_m.ShowModal()
            dlg_m.Destroy()


############### Main application class ###########

class OCRApp(wxApp):
    """Application object: creates and shows the main frame."""

    def OnInit(self):
        """Create the main frame, make it the top window; returns true."""
        frame = MainFrame(NULL, -1, "OCR to Gutenberg text")
        frame.Show(true)
        self.SetTopWindow(frame)
        return true


app = OCRApp(0)
app.MainLoop()