diff --git a/sw/source/filter/ww8/ww8par.cxx b/sw/source/filter/ww8/ww8par.cxx
index 94e2564..9c59ce4 100644
--- a/sw/source/filter/ww8/ww8par.cxx
+++ b/sw/source/filter/ww8/ww8par.cxx
@@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include
@@ -104,6 +105,7 @@ #include #include +#include #include #include
@@ -154,6 +156,8 @@ using namespace nsHdFtFlags; #include #include +#include +#include #include #include #include
@@ -2622,7 +2626,262 @@ bool SwWW8ImplReader::ReadPlainChars(WW8_CP& rPos, long nEnd, long nCpOfs)
     return nL2 >= nLen;
 }
 
-//TODO: In writer we categorize text into CJK, CTL and "Western" for everything
+/*
+ Character type   | Font (ftc)                     | Language (lid)
+ ASCII            | sprmCRgftc0                    | sprmCRglid0
+ non-East Asian   | sprmCRgftc2                    | sprmCRglid0
+ East Asian       | sprmCRgftc1                    | sprmCRglid1
+ shared character | sprmCRgftc2 if chp.idctHint==0 | sprmCRglid0 if chp.idctHint==0
+                  | sprmCRgftc1 if chp.idctHint==1 | sprmCRglid1 if chp.idctHint==1
+*/
+
+enum MSScriptType
+{
+    ASCII = i18n::ScriptType::LATIN,
+    EastAsian = i18n::ScriptType::ASIAN,
+    NonEastAsian = i18n::ScriptType::COMPLEX,
+    Shared = i18n::ScriptType::WEAK
+};
+
+//Map one UTF-16 code unit to the MSWord character type of the table above; anything not listed is NonEastAsian
+MSScriptType categorizeCharByMSScript(sal_Unicode cChar)
+{
+    if (cChar >= 0x20 && cChar <= 0x7f) //usrBasicLatin
+        return MSScriptType::ASCII;
+    if (cChar >= 0xa0 && cChar <= 0xff) //usrLatin1
+    {
+        switch (cChar)
+        {
+            case 0xa1:
+            case 0xa4:
+            case 0xa7:
+            case 0xa8:
+            case 0xaa:
+            case 0xad:
+            case 0xaf:
+            case 0xb0:
+            case 0xb1:
+            case 0xb2:
+            case 0xb3:
+            case 0xb4:
+            case 0xb6:
+            case 0xb7:
+            case 0xb8:
+            case 0xb9:
+            case 0xba:
+            case 0xbc:
+            case 0xbd:
+            case 0xbe:
+            case 0xbf:
+            case 0xd7:
+            case 0xf7:
+                return MSScriptType::Shared;
+            default:
+                return MSScriptType::NonEastAsian;
+        }
+    }
+    if (cChar >= 0x100 && cChar <= 0x17f) //usrLatinXA
+    {
+        switch (cChar)
+        {
+            case 0x100:
+            case 0x101:
+            case 0x113:
+            case 0x11b:
+            case 0x12b:
+            case 0x144:
+            case 0x148:
+            case 0x14d:
+            case 0x16b:
+                return MSScriptType::Shared;
+            default:
+                return MSScriptType::NonEastAsian;
+        }
+    }
+    if (cChar >= 0x180 && cChar <= 0x24f) //usrLatinXB
+    {
+        switch (cChar)
+        {
+            case 0x192:
+            case 0x1fa:
+            case 0x1fb:
+            case 0x1fc:
+            case 0x1fd:
+            case 0x1fe:
+                return MSScriptType::Shared;
+            default:
+                return MSScriptType::NonEastAsian;
+        }
+    }
+    if (cChar >= 0x250 && cChar <= 0x2af) //usrIPAExtensions; must stop at 0x2af or the next three ranges are shadowed
+    {
+        switch (cChar)
+        {
+            case 0x251:
+            case 0x261:
+                return MSScriptType::Shared;
+            default:
+                return MSScriptType::NonEastAsian;
+        }
+    }
+    if (cChar >= 0x2b0 && cChar <= 0x2ff) //usrSpacingModLetters
+        return MSScriptType::Shared;
+    if (cChar >= 0x300 && cChar <= 0x36f) //usrCombDiacritical
+        return MSScriptType::Shared;
+    if (cChar >= 0x370 && cChar <= 0x3cf) //usrBasicGreek
+        return MSScriptType::Shared;
+    if (cChar >= 0x400 && cChar <= 0x4ff) //usrCyrillic
+        return MSScriptType::Shared;
+    if (cChar >= 0x1e00 && cChar <= 0x1eff) //usrLatinExtendedAdd
+        return MSScriptType::Shared;
+    if (cChar >= 0x2000 && cChar <= 0x2065) //usrGeneralPunct
+        return MSScriptType::Shared;
+    if (cChar >= 0x2070 && cChar <= 0x209f) //usrSuperAndSubscript
+        return MSScriptType::Shared;
+    if (cChar >= 0x20a0 && cChar <= 0x20cf) //usrCurrencySymbols
+        return MSScriptType::Shared;
+    if (cChar >= 0x20d0 && cChar <= 0x20ff) //usrCombDiacriticsS
+        return MSScriptType::Shared;
+    if (cChar >= 0x2100 && cChar <= 0x214f) //usrLetterlikeSymbols
+        return MSScriptType::Shared;
+    if (cChar >= 0x2150 && cChar <= 0x218f) //usrNumberForms
+        return MSScriptType::Shared;
+    if (cChar >= 0x2190 && cChar <= 0x21ff) //usrArrows
+        return MSScriptType::Shared;
+    if (cChar >= 0x2200 && cChar <= 0x22ff) //usrMathematicalOps
+        return MSScriptType::Shared;
+    if (cChar >= 0x2300 && cChar <= 0x23ff) //usrMiscTechnical
+        return MSScriptType::Shared;
+    if (cChar >= 0x2400 && cChar <= 0x243f) //usrControlPictures
+        return MSScriptType::Shared;
+    if (cChar >= 0x2440 && cChar <= 0x245f) //usrOpticalCharRecog
+        return MSScriptType::Shared;
+    if (cChar >= 0x2460 && cChar <= 0x24ff) //usrEnclosedAlphanum
+        return MSScriptType::Shared;
+    if (cChar >= 0x2500 && cChar <= 0x257f) //usrBoxDrawing
+        return MSScriptType::Shared;
+    if (cChar >= 0x2580 && cChar <= 0x259f) //usrBlockElements
+        return MSScriptType::Shared;
+    if (cChar >= 0x25a0 && cChar <= 0x25ff) //usrGeometricShapes
+        return MSScriptType::Shared;
+    if (cChar >= 0x2600 && cChar <= 0x26ff) //usrMiscDingbats
+        return MSScriptType::Shared;
+    if (cChar >= 0x2700 && cChar <= 0x27bf) //usrDingbats
+        return MSScriptType::Shared;
+    if (cChar >= 0x3000 && cChar <= 0x303f) //usrCJKSymAndPunct
+        return MSScriptType::EastAsian;
+    if (cChar >= 0x3040 && cChar <= 0x309f) //usrHiragana
+        return MSScriptType::EastAsian;
+    if (cChar >= 0x30a0 && cChar <= 0x30ff) //usrKatakana
+        return MSScriptType::EastAsian;
+    if (cChar >= 0x3100 && cChar <= 0x312f) //usrBopomofo
+        return MSScriptType::EastAsian;
+    if (cChar >= 0x3130 && cChar <= 0x318f) //usrHangulCompatJamo
+        return MSScriptType::EastAsian;
+    if (cChar >= 0x3190 && cChar <= 0x319f) //usrCJKMisc
+        return MSScriptType::EastAsian;
+    if (cChar >= 0x3200 && cChar <= 0x32ff) //usrEnclosedCJKLtMnth
+        return MSScriptType::EastAsian;
+    if (cChar >= 0x3300 && cChar <= 0x33ff) //usrCJKCompatibility
+        return MSScriptType::EastAsian;
+    if (cChar >= 0x4a00 && cChar <= 0x4dff) //usrCJKCompatibility
+        return MSScriptType::EastAsian;
+    if (cChar >= 0x4e00 && cChar <= 0x9fff) //usrCJKUnifiedIdeo
+        return MSScriptType::EastAsian;
+    if (cChar >= 0xac00 && cChar <= 0xd7a3) //usrHangul
+        return MSScriptType::EastAsian;
+    if (cChar >= 0xe000 && cChar <= 0xf8ff) //usrPrivateUseArea
+        return MSScriptType::Shared;
+    if (cChar >= 0xf900 && cChar <= 0xfaff) //usrCJKCompatibilityIdeographs
+        return MSScriptType::EastAsian;
+    if (cChar >= 0xfb00 && cChar <= 0xfb4f) //usrAlphaPresentationForms
+        return MSScriptType::Shared;
+    if (cChar >= 0xfb50 && cChar <= 0xfdff) //usrArabicPresentationFormsA
+        return MSScriptType::Shared;
+    if (cChar >= 0xfe20 && cChar <= 0xfe2f) //usrCombiningHalfMarks
+        return MSScriptType::EastAsian;
+    if (cChar >= 0xfe30 && cChar <= 0xfe4f) //usrCJKCompatForms
+        return MSScriptType::EastAsian;
+    if (cChar >= 0xfe50 && cChar <= 0xfe6f) //usrSmallFormVariants
+        return MSScriptType::EastAsian;
+    if (cChar >= 0xfe70 && cChar <= 0xfefe) //usrArabicPresentationFormsB
+        return MSScriptType::Shared;
+    if (cChar >= 0xff00 && cChar <= 0xffef) //usrHFWidthForms
+        return MSScriptType::EastAsian;
+    return MSScriptType::NonEastAsian;
+}
+
+//Insert one run of same-script text; if MSWord and Writer disagree on its script, mirror MSWord's font onto Writer's font slot for the run
+void SwWW8ImplReader::emulateMSWordAddChunkToParagraph(const rtl::OUString& rAddString,
+    sal_uInt16 nLibreOfficeScript, sal_uInt16 nMSOfficeScript)
+{
+    if (nMSOfficeScript == MSScriptType::Shared) //bias shared chars by the hint; any other hint value leaves them Shared
+    {
+        if (nIdctHint == 0)
+            nMSOfficeScript = MSScriptType::NonEastAsian;
+        else if (nIdctHint == 1)
+            nMSOfficeScript = MSScriptType::EastAsian;
+    }
+    bool bForceProperties = (nMSOfficeScript != nLibreOfficeScript);
+    sal_Int16 nLibreOfficeId = RES_CHRATR_FONT; //only read while bForceProperties is true
+    if (bForceProperties)
+    {
+        //This is the ID that LibreOffice will use for the text
+        switch (nLibreOfficeScript)
+        {
+            case i18n::ScriptType::LATIN:
+            default:
+                nLibreOfficeId = RES_CHRATR_FONT;
+                break;
+            case i18n::ScriptType::ASIAN:
+                nLibreOfficeId = RES_CHRATR_CJK_FONT;
+                break;
+            case i18n::ScriptType::COMPLEX:
+                nLibreOfficeId = RES_CHRATR_CTL_FONT;
+                break;
+        }
+        //This is the ID that contains the properties that MSWord
+        //would use
+        sal_Int16 nForceLibreOfficeId;
+        switch (nMSOfficeScript)
+        {
+            case MSScriptType::ASCII:
+            default:
+                nForceLibreOfficeId = RES_CHRATR_FONT;
+                break;
+            case MSScriptType::EastAsian:
+                nForceLibreOfficeId = RES_CHRATR_CJK_FONT;
+                break;
+            case MSScriptType::NonEastAsian:
+                nForceLibreOfficeId = RES_CHRATR_CTL_FONT;
+                break;
+        }
+
+        const SvxFontItem *pSourceFont = static_cast<const SvxFontItem*>(GetFmtAttr(nForceLibreOfficeId));
+        const SvxFontItem *pDestFont = static_cast<const SvxFontItem*>(GetFmtAttr(nLibreOfficeId));
+
+        if (pSourceFont && pDestFont)
+        {
+            //They're the same anyway, great, skip forcing
+            bForceProperties = *pSourceFont != *pDestFont;
+        }
+        if (pSourceFont && bForceProperties)
+        {
+            SvxFontItem aForceFont(*pSourceFont);
+            aForceFont.SetWhich(nLibreOfficeId);
+            pCtrlStck->NewAttr(*pPaM->GetPoint(), aForceFont); //opened here, closed after the text is inserted
+        }
+        else
+            bForceProperties = false;
+    }
+
+    simpleAddTextToParagraph(rAddString);
+
+    if (bForceProperties)
+        pCtrlStck->SetAttr(*pPaM->GetPoint(), nLibreOfficeId);
+}
+
+//In writer we categorize text into CJK, CTL and "Western" for everything
 //else. Microsoft Word basically categorizes text into East Asian, Non-East
 //Asian and ASCII, with some shared characters and some properties to
 //to hint as to which way to bias those shared characters.
@@ -2635,12 +2894,46 @@ bool SwWW8ImplReader::ReadPlainChars(WW8_CP& rPos, long nEnd, long nCpOfs)
 //we're then forced (because we don't have an equivalent hint) to mirror the
 //properties of the source MSWord category into the properties of the dest
 //Writer category for that range of text in order to get the right results.
-bool SwWW8ImplReader::emulateMSWordAddTextToParagraph(const String& rAddString)
+void SwWW8ImplReader::emulateMSWordAddTextToParagraph(const rtl::OUString& rAddString)
 {
-    return simpleAddTextToParagraph(rAddString);
+    if (!rAddString.getLength())
+        return;
+
+    uno::Reference<i18n::XBreakIterator> xBI(pBreakIt->GetBreakIter());
+    if (!xBI.is()) //no break iterator, so no script info: insert the text unsplit
+    {
+        simpleAddTextToParagraph(rAddString);
+        return;
+    }
+
+    //Walk the text one char at a time, collecting runs in which both the
+    //Writer script and the MSWord category stay the same
+    rtl::OUStringBuffer sChunk;
+    const sal_Unicode *pChar = rAddString.getStr();
+    sal_Int32 i = 0;
+    sal_uInt16 nLibreOfficeScript = xBI->getScriptType(rAddString, i++);
+    MSScriptType nMSOfficeScript = categorizeCharByMSScript(*pChar);
+    sChunk.append(*pChar++);
+    while (i < rAddString.getLength())
+    {
+        sal_uInt16 nNextLibreOfficeScript = xBI->getScriptType(rAddString, i++);
+        MSScriptType nNextMSOfficeScript = categorizeCharByMSScript(*pChar);
+        if ((nNextLibreOfficeScript != nLibreOfficeScript) || //sChunk is never empty here, so no length check
+            (nNextMSOfficeScript != nMSOfficeScript))
+        {
+            emulateMSWordAddChunkToParagraph(sChunk.makeStringAndClear(), nLibreOfficeScript, nMSOfficeScript);
+        }
+        sChunk.append(*pChar);
+        nLibreOfficeScript = nNextLibreOfficeScript;
+        nMSOfficeScript = nNextMSOfficeScript;
+        ++pChar;
+    }
+
+    if (sChunk.getLength())
+        emulateMSWordAddChunkToParagraph(sChunk.makeStringAndClear(), nLibreOfficeScript, nMSOfficeScript);
 }
 
-bool SwWW8ImplReader::simpleAddTextToParagraph(const String& rAddString)
+void SwWW8ImplReader::simpleAddTextToParagraph(const String& rAddString)
 {
     const SwTxtNode* pNd = pPaM->GetCntntNode()->GetTxtNode();
     if (rAddString.Len())
@@ -2681,8 +2974,6 @@ bool SwWW8ImplReader::simpleAddTextToParagraph(const String& rAddString)
 
         bReadTable = false;
     }
-
-    return true;
 }
 
 // Returnwert: true for para end
@@ -3348,7 +3639,7 @@ SwWW8ImplReader::SwWW8ImplReader(BYTE nVersionPara, SvStorage* pStorage,
     m_bRegardHindiDigits( false ),
     mbNewDoc(bNewDoc),
     nDropCap(0),
-    nIdctHint(0),
+    nIdctHint(0xFF), //neither 0 nor 1: no idctHint in effect yet
     bBidi(false),
     bReadTable(false)
 {
diff --git a/sw/source/filter/ww8/ww8par.hxx b/sw/source/filter/ww8/ww8par.hxx
index 4be96b7..c2d4993 100644
--- a/sw/source/filter/ww8/ww8par.hxx
+++ b/sw/source/filter/ww8/ww8par.hxx
@@ -1117,8 +1117,10 @@ private:
         pReffingStck = 0;
     }
     void DeleteAnchorStk() { DeleteStk( pAnchorStck ); pAnchorStck = 0; }
-    bool emulateMSWordAddTextToParagraph(const String& sAddString);
-    bool simpleAddTextToParagraph(const String& sAddString);
+    void emulateMSWordAddTextToParagraph(const rtl::OUString& rAddString);
+    void emulateMSWordAddChunkToParagraph(const rtl::OUString& rAddString,
+        sal_uInt16 nLibreOfficeScript, sal_uInt16 nMSOfficeScript);
+    void simpleAddTextToParagraph(const String& sAddString);
     bool HandlePageBreakChar();
     bool ReadChar(long nPosCp, long nCpOfs);
     bool ReadPlainChars(WW8_CP& rPos, long nEnd, long nCpOfs);
diff --git a/sw/source/filter/ww8/ww8par6.cxx b/sw/source/filter/ww8/ww8par6.cxx
index a9cf3b0..6020d2a 100644
--- a/sw/source/filter/ww8/ww8par6.cxx
+++ b/sw/source/filter/ww8/ww8par6.cxx
@@ -4312,7 +4312,7 @@ void SwWW8ImplReader::Read_UL( USHORT nId, const BYTE* pData, short nLen )
 void SwWW8ImplReader::Read_IdctHint( USHORT, const BYTE* pData, short nLen )
 {
     if (nLen < 0)
-        nIdctHint = 0;
+        nIdctHint = 0xFF; //attribute ended: back to the "no hint" value
     else
         nIdctHint = *pData;
 }