Skip to content

Commit

Permalink
This commit represents a fix for the issue reported by Roland F in wh…
Browse files Browse the repository at this point in the history
…ich the presence of an \nd* inline binding end marker in the middle of a word was causing the remaining part of the word to be parsed as a separate source phrase (leaving what appeared to be a space, but was actually a pile gap, between the parts of the word).

To effect a fix, I added a wordSuffix ref parameter to the ParsePostWordPunctsAndEndMkrs() call to cover the situation where an inline binding end marker such as \nd* occurs in the middle of a word. The wordSuffix ref parameter returns any word-building suffix part of the word that directly follows the inline binding end marker. Without this, as Roland F reported, some of his words of the form \nd<word-first-part>\nd*<word-suffix-part><space-between-words> were getting broken up, resulting in the <word-suffix-part> being parsed as a separate source phrase. We can also use any returned wordSuffix to update the m_key and m_srcPhrase members with the suffix. Some code for doing this was borrowed from ParseAWord().
Ensured that the m_srcSinglePattern value gets any form of the ParseWord() function call that has an embedded inline binding end marker within it. This makes it possible to reconstruct the correct position of an embedded inline binding end marker when exporting the source text.
This commit also modifies the export source text routine FromSingleMakeSstr2() to utilize the m_srcSinglePattern value to achieve more accurate positioning of inline binding end markers like \nd* for more robust exports of the source text.
  • Loading branch information
pngbill committed Dec 27, 2024
1 parent 5e25f1d commit 908292f
Show file tree
Hide file tree
Showing 4 changed files with 173 additions and 18 deletions.
158 changes: 148 additions & 10 deletions source/Adapt_ItDoc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21517,7 +21517,11 @@ int CAdapt_ItDoc::ParseDate(wxChar* pChar, wxChar* pEnd, wxString spacelessPunct

// BEW 7Jun23 created next, for parsing final puncts, which may be all or some detached by preceding
// whitespace(s), and getting to the puncts may require parsing first over one or more inlineBindingEndMarkers
wxChar* CAdapt_ItDoc::ParsePostWordPunctsAndEndMkrs(wxChar* pChar, wxChar* pEnd, CSourcePhrase* pSrcPhrase, int& itemLen, wxString spacelessPuncts)
// whm 26Dec2024 added an additional reference parameter wxString& wordSuffix to the header of this function
// which allows returning to the caller any word suffix that exists after the position of an inline binding end
// marker.
wxChar* CAdapt_ItDoc::ParsePostWordPunctsAndEndMkrs(wxChar* pChar, wxChar* pEnd, CSourcePhrase* pSrcPhrase,
int& itemLen, wxString& wordSuffix, wxString spacelessPuncts)
{
// pChar comes in, pointing at the first wxChar following whatever ParseAWord() parsed over, and the caller will have
// a len value which is not zero. We parse over binding endMkr if present, then over puncts (detached or not) - and there
Expand Down Expand Up @@ -21559,6 +21563,7 @@ wxChar* CAdapt_ItDoc::ParsePostWordPunctsAndEndMkrs(wxChar* pChar, wxChar* pEnd,
wxString augWholeMkr = wxEmptyString;
int numEndPuncts; numEndPuncts = 0; // init
wxString strEndPuncts; strEndPuncts = wxEmptyString; // init
bool bParsedAndStoredInlineBindingEndMkr = FALSE;

int itemSpan = 0; // this is an item length valid only for the current iteration, the loop may iterate several times, and
// each time set a new itemSpan when parsed data is stored in pSrcPhrase; but we need to accumulate
Expand All @@ -21578,6 +21583,12 @@ wxChar* CAdapt_ItDoc::ParsePostWordPunctsAndEndMkrs(wxChar* pChar, wxChar* pEnd,
{
int halt_here = 1; wxUnusedVar(halt_here); // avoid warning variable initialized but not referenced
}
#endif
#if defined (_DEBUG)
if (pSrcPhrase->m_nSequNumber == 5)
{
int halt_here = 1; wxUnusedVar(halt_here); // avoid warning variable initialized but not referenced
}
#endif
if (*ptr == gSFescapechar)
{
Expand All @@ -21598,6 +21609,7 @@ wxChar* CAdapt_ItDoc::ParsePostWordPunctsAndEndMkrs(wxChar* pChar, wxChar* pEnd,
itemLenAccum += itemSpan;
ptr += itemSpan; // ptr has advanced
itemLen += itemSpan;
bParsedAndStoredInlineBindingEndMkr = TRUE;
}
} // end of TRUE block for test: if (*ptr == gSFescapechar)
else
Expand Down Expand Up @@ -21652,6 +21664,35 @@ wxChar* CAdapt_ItDoc::ParsePostWordPunctsAndEndMkrs(wxChar* pChar, wxChar* pEnd,
}
#endif

// whm 26Dec2024 added. We need to check if the immediate character following the parsing and storing of
// an inline binding end marker such as \nd* is a word-building character or not. If so, we iterate to
// accumulate as many word-building chars as exist, and return them via the wordSuffix ref parameter, and
// also update the returned ptr value and itemLen to include the length of any encountered wordSuffix chars.
// Some code here borrowed from ParseAWord().
wordSuffix.Empty(); // Initialize
if (bParsedAndStoredInlineBindingEndMkr)
{
bParsedAndStoredInlineBindingEndMkr = FALSE;
// ptr will now be pointing at the first char past the inline binding end marker that was parsed
// and stored above. We iterate the ptr until we encounter pEnd or any non-word-building character
// and store any chars that qualify in the wordSuffix ref parameter for processing by the caller.
//wxChar* pAux2;
//pAux2 = ptr; // use pAux for parsing over one or more following puncts; do all options for each iteration
wxChar zwsp = (wxChar)0x200B;
wxChar bar = _T('|');
bool bNotWhitespace = !IsWhiteSpace(ptr);
bool bCanProceed = CanParseForward(ptr, spacelessPuncts, pEnd);
while (bNotWhitespace && bCanProceed && (*ptr != gSFescapechar) && (*ptr != bar) && (*ptr != zwsp) && !(ptr == pEnd))
{
wordSuffix += *ptr;
ptr++;
itemLen++;
// By using IsWhiteSpace(ptr) I get automatic zwsp support
bNotWhitespace = !IsWhiteSpace(ptr);
bCanProceed = CanParseForward(ptr, spacelessPuncts, pEnd);
}
}

// Now ptr points at one of these options (longest first, shortest last):
// 1. One or more substrings of type: space + punct(s)
// 2. A mix of undetached puncts and substrings of type: space + punct(s)
Expand Down Expand Up @@ -43430,7 +43471,7 @@ int CAdapt_ItDoc::ParseWord(wxChar* pChar,
wxString pointsAt = wxString(ptr, 16);
wxLogDebug(_T("ParseWord() line %d , pSrcPhrase->m_nSequNumber = %d , m_key= [%s] , len= %d , m_adaption=[%s], m_markers=[%s] , pointsAt= [%s]"),
__LINE__, pSrcPhrase->m_nSequNumber, pSrcPhrase->m_key.c_str(), len, pSrcPhrase->m_adaption.c_str(), pSrcPhrase->m_markers.c_str(), pointsAt.c_str());
if (pSrcPhrase->m_nSequNumber >= 1)
if (pSrcPhrase->m_nSequNumber >= 5)
{
int halt_here = 1; wxUnusedVar(halt_here);
}
Expand All @@ -43443,9 +43484,30 @@ int CAdapt_ItDoc::ParseWord(wxChar* pChar,
__LINE__, pSrcPhrase->m_nSequNumber, pSrcPhrase->m_key.c_str());
}
#endif

ptr = ParsePostWordPunctsAndEndMkrs(ptr, pEnd, pSrcPhrase, itemLen, spacelessPuncts);
// whm 26Dec2024 added wordSuffix ref parameter to the ParsePostWordPunctsAndEndMkrs() function to
// cover the situation where an inline binding end marker such as \nd* occurs in the middle of a word.
// The wordSuffix ref parameter will return any such word-building suffix part of the word that
// occurs directly after the inline binding end marker. Otherwise, as Roland F reported, some of
// his words that had, for example, \nd<word-first-part>\nd*<word-suffix-part><space-between-words>
// were getting broken up, resulting in the <word-suffix-part> being parsed as a separate source phrase.
// We utilize any returned wordSuffix to update the m_key and m_srcPhrase members to include this
// suffix.
wxString wordSuffix; wordSuffix.Empty();
ptr = ParsePostWordPunctsAndEndMkrs(ptr, pEnd, pSrcPhrase, itemLen, wordSuffix, spacelessPuncts);
len += itemLen;
// whm 26Dec2024 added. Use any non-empty wordSuffix parameter value as a suffix to the m_key and
// m_srcPhrase member so that that suffix part of the word doesn't get used to create a separate
// source phrase.
if (!wordSuffix.IsEmpty())
{
pSrcPhrase->m_key += wordSuffix;
pSrcPhrase->m_srcPhrase += wordSuffix;
int wordLen = wordSuffix.Length();
ptr += wordLen;
len += wordLen;
wordSuffix.Empty();
}
//
//wxLogDebug(_T("LEN+PTR line %d , m_markers= [%s], len %d , 20 at ptr= [%s]"), __LINE__, pSrcPhrase->m_markers.c_str(), len, wxString(ptr, 20).c_str());
{
// BEW 18Jul23, it's possible that the returned ptr is pointing at a beginMkr, like \f for instance.
Expand Down Expand Up @@ -47224,7 +47286,7 @@ int CAdapt_ItDoc::TokenizeText(int nStartingSequNum, SPList* pList, wxString& rB


#if defined (_DEBUG) //&& !defined(NOLOGS)
if (pSrcPhrase->m_nSequNumber == 99)
if (pSrcPhrase->m_nSequNumber == 5)
{
int break_here = 0; wxUnusedVar(break_here);
}
Expand Down Expand Up @@ -49263,7 +49325,17 @@ int CAdapt_ItDoc::TokenizeText(int nStartingSequNum, SPList* pList, wxString& rB
// any puncts that immediately follow the filtered info that really should be stored on
// pPrevSrcPhrase. Note that we pass pPrevSrcPhrase here into the function - the function
// now internally protects from pPrevSrcPhrase being NULL.
ptr = ParsePostWordPunctsAndEndMkrs(ptr, pEnd, pPrevSrcPhrase, itemLen, spacelessPuncts);
//
// whm 26Dec2024 added wordSuffix ref parameter to ParsePostWordPunctsAndEndMkrs() call to cover
// the situation where an inline binding end marker such as \nd* occurs in the middle of a word.
// The wordSuffix ref parameter will return any such word-building suffix part of the word that
// occurs directly after the inline binding end marker. Otherwise, as Roland F reported, some of
// his words that had, for example, \nd<word-first-part>\nd*<word-suffix-part><space-between-words>
// were getting broken up, resulting in the <word-suffix-part> being parsed as a separate source phrase.
// We utilize any returned wordSuffix to update the m_key and m_srcPhrase members with this suffix.
wxString wordSuffix; wordSuffix.Empty();
ptr = ParsePostWordPunctsAndEndMkrs(ptr, pEnd, pPrevSrcPhrase, itemLen, wordSuffix, spacelessPuncts);
//ptr = ParsePostWordPunctsAndEndMkrs(ptr, pEnd, pPrevSrcPhrase, itemLen, spacelessPuncts);

// We don't know what lies beyond the endmarker, but it's handled further down.
// if after some whitespace there is a backslash, then continue; to iterate the loop.
Expand Down Expand Up @@ -51468,7 +51540,17 @@ int CAdapt_ItDoc::TokenizeText(int nStartingSequNum, SPList* pList, wxString& rB
// any puncts that immediately follow the filtered info that really should be stored on
// pPrevSrcPhrase. Note that we pass pPrevSrcPhrase here into the function, and internally
// ParsePostWordPunctsAndEndMkrs() has protection against pPrevSrcPhrase being NULL.
ptr = ParsePostWordPunctsAndEndMkrs(ptr, pEnd, pPrevSrcPhrase, itemLen, spacelessPuncts);
//
// whm 26Dec2024 added wordSuffix ref parameter to ParsePostWordPunctsAndEndMkrs() call to cover
// the situation where an inline binding end marker such as \nd* occurs in the middle of a word.
// The wordSuffix ref parameter will return any such word-building suffix part of the word that
// occurs directly after the inline binding end marker. Otherwise, as Roland F reported, some of
// his words that had, for example, \nd<word-first-part>\nd*<word-suffix-part><space-between-words>
// were getting broken up, resulting in the <word-suffix-part> being parsed as a separate source phrase.
// We can utilize any returned wordSuffix to update the m_key and m_srcPhrase members with this suffix.
wxString wordSuffix; wordSuffix.Empty();
ptr = ParsePostWordPunctsAndEndMkrs(ptr, pEnd, pPrevSrcPhrase, itemLen, wordSuffix, spacelessPuncts);
//ptr = ParsePostWordPunctsAndEndMkrs(ptr, pEnd, pPrevSrcPhrase, itemLen, spacelessPuncts);

// We don't know what lies beyond the endmarker, but it's handled further down.
// if after some whitespace there is a backslash, then continue; to iterate the loop.
Expand Down Expand Up @@ -51977,7 +52059,7 @@ int CAdapt_ItDoc::TokenizeText(int nStartingSequNum, SPList* pList, wxString& rB
//wxLogDebug(_T("TokText(), line %d : sequNum = %d , m_srcPhrase = [%s] , m_curTextType = %d, m_key = [%s] , m_markers= [%s], at=[%s]"), __LINE__,
// (int)pSrcPhrase->m_nSequNumber, pSrcPhrase->m_srcPhrase.c_str(), (int)pSrcPhrase->m_curTextType,
// pSrcPhrase->m_key.c_str(), pSrcPhrase->m_markers.c_str(), ptrPointsAt.c_str());
if (pSrcPhrase->m_nSequNumber >= 28) // whm break
if (pSrcPhrase->m_nSequNumber >= 5) // whm break
{
int halt_here = 1; wxUnusedVar(halt_here);
}
Expand Down Expand Up @@ -52025,7 +52107,7 @@ int CAdapt_ItDoc::TokenizeText(int nStartingSequNum, SPList* pList, wxString& rB
wxLogDebug(_T("TokText(), line %d, sequNum= %d , m_bSpecialText= %d , m_curTextType= %d, m_key= [%s], m_precPunct= [%s] , ptrPointsAt= [%s]"),
__LINE__, (int)pSrcPhrase->m_nSequNumber, (int)pSrcPhrase->m_bSpecialText, (int)pSrcPhrase->m_curTextType,
pSrcPhrase->m_key.c_str(), pSrcPhrase->m_precPunct.c_str(), ptrPointsAt.c_str());
if (pSrcPhrase->m_nSequNumber >= 2)
if (pSrcPhrase->m_nSequNumber >= 5)
{
int halt_here = 1; wxUnusedVar(halt_here);
}
Expand Down Expand Up @@ -52237,6 +52319,7 @@ int CAdapt_ItDoc::TokenizeText(int nStartingSequNum, SPList* pList, wxString& rB
// no residue as the matched ones are removed. Setting it is only do-able here, so once its done, it
// needs to stay unchanged. (It's updateable perhaps if the user manually makes puncts changes.)
wxString strWordAndExtras = wxEmptyString;
wxString strWordAndExtrasPlusIBEM = wxEmptyString; // whm 26Dec2024 added
if (itemLen > 0 && bTokenizingTargetText == FALSE && pSrcPhrase->m_srcSinglePattern.IsEmpty())
{
// Here we handle setting m_srcSinglePattern, for the text being source text.
Expand All @@ -52248,7 +52331,9 @@ int CAdapt_ItDoc::TokenizeText(int nStartingSequNum, SPList* pList, wxString& rB
// the MakeWordAndExtras() to detect if ptr is pointing at a begin marker and, if so, skip that
// marker when returning its processed string to strWordAndExtras below.
strWordAndExtras = MakeWordAndExtras(ptr, itemLen);
strWordAndExtrasPlusIBEM = strWordAndExtras; // if inlineBindingEndMarker is present preserve it here
strWordAndExtras.Trim(); // remove any final whitespace(s)
strWordAndExtrasPlusIBEM.Trim(); // '' '' '' ''
// BEW 3May23, I get the wanted values, for each pSrcPhrase, but need a storage location on pSrcPhrase.
// 4May23 I've implemented document VERSION_NUMBER 10
//
Expand All @@ -52263,6 +52348,47 @@ int CAdapt_ItDoc::TokenizeText(int nStartingSequNum, SPList* pList, wxString& rB
//if (docVersion >= 10 && pSrcPhrase->m_nSrcWords < 2)
if (pSrcPhrase->m_nSrcWords < 2)
{
// whm 26Dec2024 added. Check if ParseWord() processed an inline binding end marker that
// got left embedded in the middle of the strWordAndExtras. This could happen as with
// Roland F's data which could have something like " Evezøza\\nd*-qa" where the -qa is
// actually the last part/suffix of the word being parsed, and the word has an \nd*
// inline binding end marker embedded within it.
// ParseWord()'s call of ParsePostWordPunctsAndEndMkrs() stored the inline binding end marker
// within pSrcPhrase's m_inlineBindingEndMarkers member, but we may need to remove the
// inline binding end marker from the strWordAndExtras string itself here, since whenever
// a word has an inline binding end marker embedded within the word so that the word has
// a suffix of word building chars following the inline binding end marker, the marker will
// remain within the strWordAndExtras at this point. We check for this possibility here
// and remove it if present in strWordAndExtras.
// The ParseWord() function also returns a value of TRUE in its bIsInlineBindingMkr ref
// parameter when it has processed a word involving an inline binding marker/end marker.
if (bIsInlineBindingMkr)
{
wxString mkr; mkr.Empty();
wxString endMkrStr; endMkrStr.Empty();
// The possible inlineBindingEndMarkers are:
// _T("\\add* \\bk* \\dc* \\k* \\lit* \\nd* \\ord* \\pn* \\sig* \\em* \\bd* \\it*
// \\fk* \\bdit* \\no* \\sc* \\pb* \\ndx* \\pro* \\w* \\wg* \\wh* \\qs* \\+add* \\+bk*
// \\+dc* \\+k* \\+lit* \\+nd* \\+ord* \\+pn* \\+sig* \\+em* \\+bd* \\+it* \\+bdit*
// \\+no* \\+sc* \\+pb* \\+ndx* \\+pro* \\+w* \\+wg* \\+wh* \\+qs* \\cat* ")
// which are stored in the App's m_inlineBindingEndMarkers set.
wxArrayString mkrsArr; mkrsArr.Empty();
wxString inlineBindingEndMkrs = pSrcPhrase->GetInlineBindingEndMarkers();
mkrsArr.Empty();
GetMarkersAndEndMarkersFromString(&mkrsArr, inlineBindingEndMkrs, endMkrStr);
int nTot = (int)mkrsArr.GetCount();
for (int i = 0; i < nTot; i++)
{
mkr = mkrsArr.Item(i);
mkr.Trim(); // do not use augmented marker form here
// if the strWordAndExtras has the mkr, replace it with empty string
if (strWordAndExtras.Find(mkr) != wxNOT_FOUND)
{
strWordAndExtras.Replace(mkr, wxEmptyString);
}
}
}
//
// whm 18Feb2024 added. Check if ParseWord() processed an old bar code
if (bProcessedOldBarCode)
{
Expand Down Expand Up @@ -52348,7 +52474,19 @@ int CAdapt_ItDoc::TokenizeText(int nStartingSequNum, SPList* pList, wxString& rB
}

// It's not a merger, so store in pSrcPhrase's new m_srcSinglePattern member
pSrcPhrase->m_srcSinglePattern = strWordAndExtras; // the pSrcPhrase->m_key value will start the string
//
// whm 26Dec2024 modification. We should preserve in m_srcSinglePattern any form of the ParseWord()
// function call that has an embedded inline binding end marker within it. This may make it possible
// to reconstruct the correct position of an embedded inline binding end marker when exporting the
// source text.
if (bIsInlineBindingMkr)
{
pSrcPhrase->m_srcSinglePattern = strWordAndExtrasPlusIBEM; // may have an embedded inlineBindingEndMarker
}
else
{
pSrcPhrase->m_srcSinglePattern = strWordAndExtras; // the pSrcPhrase->m_key value will start the string
}
// BEW 10May23, exporting src text will call RebuildSourceText() and for user-edited m_key value we need
// to have available the m_oldKey value (for comparison) so as to know if an edit of m_srcSinglePattern
// is needed in the rebuild. Today I added support for pSrcPhrase->m_oldKey to the document's xml.
Expand Down
4 changes: 3 additions & 1 deletion source/Adapt_ItDoc.h
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,9 @@ bool bMarkAsDirty); // might want it instantly saveable

// BEW 7Jun23 created next, for parsing final puncts, which may be all or some detached by preceding
// whitespace(s), and getting to the puncts may require parsing over one or more endEndMarkers
wxChar* ParsePostWordPunctsAndEndMkrs(wxChar* pChar, wxChar* pEnd, CSourcePhrase* pSrcPhrase, int& itemLen, wxString spacelessPuncts);
//wxChar* ParsePostWordPunctsAndEndMkrs(wxChar* pChar, wxChar* pEnd, CSourcePhrase* pSrcPhrase, int& itemLen, wxString spacelessPuncts);
wxChar* ParsePostWordPunctsAndEndMkrs(wxChar* pChar, wxChar* pEnd, CSourcePhrase* pSrcPhrase,
int& itemLen, wxString& wordSuffix, wxString spacelessPuncts);
bool IsGenuineFollPunct(wxChar chPunct); // BEW 7Jun23 created, for use in ParsePostWordPuncts() in ParseWord()
void CountGoodAndBadEndPuncts(wxString strEndPuncts, int& nGood, int& nBad); // BEW 7Jun23 created, for use in ParsePostWordPuncts()
// BEW added ParseDate 16Jun23 for parsing data like 02/26/01 or 02/26/2001, or 2010/05/24 , or 12/02 etc.
Expand Down
7 changes: 4 additions & 3 deletions source/ExportFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17212,9 +17212,10 @@ wxString AppendSrcPhraseEndingInfo(wxString appendHere, CSourcePhrase* pSrcPhras
// the placement of the m_follOuterPunct. So, I'm checking to see if the stuff that
// follows the m_key part of m_srcSinglePattern m_srcPhrase + appendHere
// value at this point is equal to the m_srcSinglePattern or not. If not, I'm
// extracting the m_srcPhrase from m_srcSinglePattern, differes from what has been
// composed above for appendHere. If they are not the same, I'm substituting what
// follows the m_key in m_srcSinglePattern in place of the appendHere value.
// extracting the m_srcPhrase from m_srcSinglePattern, to see if it differs from
// what has been composed above for appendHere. If they are not the same, I'm
// substituting what follows the m_key in m_srcSinglePattern in place of the appendHere
// value.
// Testing results indicate that this improves the exports of source texts.
if (!appendHere.IsEmpty() && !pSrcPhrase->m_srcSinglePattern.IsEmpty())
{
Expand Down
Loading

0 comments on commit 908292f

Please sign in to comment.