author     Alessandro Portale <[email protected]>    2022-09-09 11:35:55 +0200
committer  Alessandro Portale <[email protected]>    2022-09-09 21:44:26 +0000
commit     c21b150aa86ea075fb8d09c3d82332be225425cb
tree       a36f0abc0d83c2b24fb1b1a230bd03a71faace47
parent     ed89cc730dd26009c1c0a2d609abc995c32cc585
scripts: Improve scrubts.py
Output statistics on removed duplicate messages and merged contexts.
List remaining duplicate messages with identical source but different
translation.
Change-Id: If06f5cfc898c6261863cc53a3c464efead1d9890
Reviewed-by: Alessandro Portale <[email protected]>
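
For orientation (not part of the commit): a Qt Linguist .ts context roughly looks like the hypothetical Python-embedded fragment below, with all context names and strings invented. The first two <message> blocks are byte-for-byte identical, which is the case the script removes; the third repeats the same <source> with a different <translation>, which is the case the script now merely lists.

# Hypothetical .ts excerpt illustrating the two kinds of duplicates (content invented).
ts_excerpt = """\
<context>
    <name>QtC::Core</name>
    <message>
        <source>Open File</source>
        <translation>Datei öffnen</translation>
    </message>
    <message>
        <source>Open File</source>
        <translation>Datei öffnen</translation>
    </message>
    <message>
        <source>Open File</source>
        <translation>Datei oeffnen</translation>
    </message>
</context>
"""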
-rw-r--r--  scripts/scrubts.py | 88
1 file changed, 76 insertions, 12 deletions
diff --git a/scripts/scrubts.py b/scripts/scrubts.py
index ede6af60d43..6426189a347 100644
--- a/scripts/scrubts.py
+++ b/scripts/scrubts.py
@@ -10,15 +10,16 @@
 import argparse
 import pathlib
-import re
 import sys
-
+from dataclasses import dataclass
 
 
 def rewriteLines(input, scrubbedContext, tsFilePath):
     result = []
     previouslyInContext = False
     contextWasPresent = False
     messageHashes = []
+    mergedContextsCount = 0
+    removedDuplicatesCount = 0
 
     lineIter = iter(input)
     for line in lineIter:
@@ -27,6 +28,7 @@ def rewriteLines(input, scrubbedContext, tsFilePath):
             if line.count(scrubbedContext + r"</name>") == 1: # It the context being scrubbed
                 contextWasPresent = True
                 if previouslyInContext: # Previous context was a scrubbed context, so merge them
+                    mergedContextsCount += 1
                     result = result[ : -2] # Remove recent: </context>\n<context>
                     continue # ...and skip this input line
                 else:
@@ -35,7 +37,7 @@ def rewriteLines(input, scrubbedContext, tsFilePath):
                 previouslyInContext = False
 
         # Message de-duplicating
-        if previouslyInContext and line.count(r"<message>") == 1: # message in scrubbed context
+        if previouslyInContext and line.count(r"<message") == 1: # message in scrubbed context
             # Iterate through message
             messageLines = [line]
             for messageLine in lineIter:
@@ -48,6 +50,8 @@ def rewriteLines(input, scrubbedContext, tsFilePath):
             if messageHash not in messageHashes:
                 result = result + messageLines
                 messageHashes.append(messageHash) # Append if not a duplicate
+            else:
+                removedDuplicatesCount += 1
             continue
 
 
@@ -57,27 +61,87 @@ def rewriteLines(input, scrubbedContext, tsFilePath):
         error = f"Context \"{scrubbedContext}\" was not found in {tsFilePath}"
         sys.exit(error)
 
+    print (f"{tsFilePath}:")
+    print (f"    {removedDuplicatesCount} identical duplicate message(s) removed.")
+    print (f"    {mergedContextsCount} occurrence(s) of context \"{scrubbedContext}\" merged.")
+
     return result
 
 
+def findDistinctDuplicates(input, scrubbedContext, tsFilePath):
+    inContext = False
+
+    @dataclass
+    class Translation:
+        lineNr: int
+        translationXml: []
+
+    @dataclass
+    class Source:
+        sourceXml: str
+        translations: []
+
+    messages = {}
+
+    lineIter = iter(input)
+    for lineNr, line in enumerate(lineIter):
+        if line.count(r"</name>") == 1: # Any new context
+            inContext = (line.count(scrubbedContext + r"</name>") == 1)
+            continue
+        if line.count(r"<message") == 0:
+            continue
+        if inContext:
+            sourceXml = []
+            for sourceLine in lineIter: # <source>..</source> (possibly multi-line)
+                sourceXml.append(sourceLine)
+                if sourceLine.count(r"</source>") == 1:
+                    break
+            sourceXmlHash = hash(str(sourceXml))
+            translationXml = []
+            for translationLine in lineIter: # <translation>..</translation> (possibly multi-line)
+                translationXml.append(translationLine)
+                if translationLine.count(r"</translation>") == 1:
+                    break
+            translation = Translation(lineNr + 1, translationXml)
+            if sourceXmlHash in messages:
+                messages[sourceXmlHash].translations.append(translation)
+            else:
+                messages[sourceXmlHash] = Source(sourceXml, [translation])
+
+    for sourceId in messages:
+        source = messages[sourceId]
+        translationsCount = len(source.translations)
+        if translationsCount > 1:
+            print (f"\n{translationsCount} duplicates for source:")
+            for sourceXmlLine in source.sourceXml:
+                print (sourceXmlLine.rstrip())
+            for translation in source.translations:
+                print (f"\n{tsFilePath}:{translation.lineNr}")
+                for translationXmlLine in translation.translationXml:
+                    print (translationXmlLine.rstrip())
+
+
 def processTsFile(tsFilePath, scrubbedContext):
     with open(tsFilePath, 'r') as tsInputFile:
         lines = tsInputFile.readlines()
 
     result = rewriteLines(lines, scrubbedContext, tsFilePath)
 
+    if lines != result:
+        with open(tsFilePath, 'w') as tsOutputFile:
+            for line in result:
+                tsOutputFile.write(line)
 
-    with open(tsFilePath, 'w') as tsOutputFile:
-        for line in result:
-            tsOutputFile.write(line)
+    findDistinctDuplicates(result, scrubbedContext, tsFilePath)
 
 
 def main():
-    parser = argparse.ArgumentParser(description='Rewrites a .ts file, removing duplicate messages '
-                                                 'of a specified translation context and joining '
-                                                 'adjacent occurrences of that context. '
-                                                 'Unlike lrelease and lconvert, this script does '
-                                                 'an exact comparison of the whole <message/> xml '
-                                                 'tag.')
+    parser = argparse.ArgumentParser(
+        description='''Rewrites a .ts file, removing identical duplicate messages of a specified
+                       translation context and joining adjacent occurrences of that context.
+                       Unlike lrelease and lconvert, this script does an exact comparison of the
+                       whole <message/> xml tag when removing duplicates.
+                       Subsequently, the remaining duplicate messages with identical source but
+                       different translation are listed with filename:linenumber.''')
     parser.add_argument('tsfile',
                         help='The .ts file to be processed.',
                         type=pathlib.Path)
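
As a reading aid only, here is a minimal, self-contained sketch of the grouping idea behind the new findDistinctDuplicates(): key each <message> of the scrubbed context by its raw <source> lines, collect the translations per key, and report every source that keeps more than one translation entry. The function name, sample lines and output format below are simplified assumptions for illustration, not the script's actual code.

from collections import defaultdict

# Simplified stand-in for the lines of a .ts file (invented content).
sample_lines = [
    '<context>\n',
    '    <name>QtC::Core</name>\n',
    '    <message>\n',
    '        <source>Open File</source>\n',
    '        <translation>Datei öffnen</translation>\n',
    '    </message>\n',
    '    <message>\n',
    '        <source>Open File</source>\n',
    '        <translation>Datei oeffnen</translation>\n',
    '    </message>\n',
    '</context>\n',
]

def list_distinct_duplicates(lines, scrubbed_context):
    in_context = False
    by_source = defaultdict(list)        # source text -> [(message line nr, translation), ...]
    line_iter = iter(enumerate(lines, start=1))
    for line_nr, line in line_iter:
        if '</name>' in line:            # entering a new context
            in_context = (scrubbed_context + '</name>') in line
            continue
        if in_context and '<message' in line:
            source, translation = [], []
            for _, inner in line_iter:   # consume <source>..</source>
                source.append(inner)
                if '</source>' in inner:
                    break
            for _, inner in line_iter:   # consume <translation>..</translation>
                translation.append(inner)
                if '</translation>' in inner:
                    break
            by_source[''.join(source)].append((line_nr, ''.join(translation)))
    for source, translations in by_source.items():
        if len(translations) > 1:        # same source, more than one translation entry
            print(f'{len(translations)} duplicates for source:')
            print(source.rstrip())
            for line_nr, translation in translations:
                print(f'  message at line {line_nr}: {translation.strip()}')

list_distinct_duplicates(sample_lines, 'QtC::Core')

Run on the invented sample, this prints the one source that ends up with two differing translations, similar in spirit to the filename:linenumber report the commit adds; keying by the raw <source> lines mirrors the script's exact textual comparison rather than any XML-aware equivalence.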