libclamunrar/unicode.cpp
d39cb658
 #include "rar.hpp"
 #define MBFUNCTIONS
 
 #if defined(_UNIX) && defined(MBFUNCTIONS)
 
 // Forward declarations of the Unix only mapping helpers defined later in
 // this file. Both report the conversion result through 'Success'.
 static bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success);
 static void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success);
 
 // In Unix we map high ASCII characters which cannot be converted to Unicode
 // to 0xE000 - 0xE0FF private use Unicode area.
 // CharToWideMap maps only bytes 0x80 - 0xFF and WideToCharMap restores only
 // codes MapAreaStart+0x80 - MapAreaStart+0xFF, so the lower half of the area
 // is never produced or restored by the code in this file.
 static const uint MapAreaStart=0xE000;
 
 // Mapped string marker. Initially we used 0xFFFF for this purpose,
 // but it causes MSVC2008 swprintf to fail (it treats 0xFFFF as error marker).
 // While we could workaround it, it is safer to use another character.
 // CharToWideMap stores the marker before the first mapped character and
 // WideToCharMap refuses to process strings which do not contain it.
 static const uint MappedStringMark=0xFFFE;
 
 #endif
 
 // Convert the zero terminated wide string 'Src' to a multibyte string in
 // 'Dest'. 'DestSize' is the size of the 'Dest' buffer in chars.
 // Returns false if the platform conversion reported a failure. 'Dest' can
 // still contain a partially converted string in this case, see the comment
 // before 'return'. If DestSize>0, Dest[DestSize-1] is always set to 0.
 // NOTE(review): '*Dest=0' below is executed before DestSize is examined,
 // so DestSize==0 would write outside of the buffer. Presumably callers
 // never pass 0, confirm.
 bool WideToChar(const wchar *Src,char *Dest,size_t DestSize)
 {
   bool RetCode=true;
   *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.
 
 #ifdef _WIN_ALL
   // Convert to the current ANSI code page. 0 result means the failure.
   if (WideCharToMultiByte(CP_ACP,0,Src,-1,Dest,(int)DestSize,NULL,NULL)==0)
     RetCode=false;
 
 // wcstombs is broken in Android NDK r9.
 #elif defined(_APPLE)
   // WideToUtf returns nothing, so RetCode stays 'true' in this branch.
   WideToUtf(Src,Dest,DestSize);
 
 #elif defined(MBFUNCTIONS)
   // WideToCharMap returns false if 'Src' does not contain the mapped string
   // mark. Only then we fall back to the generic wcsrtombs conversion.
   // Otherwise WideToCharMap has already filled 'Dest' and 'RetCode'.
   if (!WideToCharMap(Src,Dest,DestSize,RetCode))
   {
     mbstate_t ps; // Use thread safe external state based functions.
     memset (&ps, 0, sizeof(ps));
     const wchar *SrcParam=Src; // wcsrtombs can change the pointer.
 
     // Some implementations of wcsrtombs can cause memory analyzing tools
     // like valgrind to report uninitialized data access. It happens because
     // internally these implementations call SSE4 based wcslen function,
     // which reads 16 bytes at once including those beyond of trailing 0.
     size_t ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
 
     if (ResultingSize==(size_t)-1 && errno==EILSEQ)
     {
       // Aborted on inconvertible character not zero terminating the result.
       // EILSEQ helps to distinguish it from small output buffer abort.
       // We want to convert as much as we can, so we clean the output buffer
       // and repeat conversion.
       // The second call gets the same arguments as the first one. Since
       // 'Dest' is zero filled before it, whatever prefix it manages to
       // convert is left zero terminated.
       memset (&ps, 0, sizeof(ps));
       SrcParam=Src; // wcsrtombs can change the pointer.
       memset(Dest,0,DestSize);
       ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
     }
 
     if (ResultingSize==(size_t)-1)
       RetCode=false;
     // Nothing was converted though the source is not empty.
     if (ResultingSize==0 && *Src!=0)
       RetCode=false;
   }
 #else
   // No conversion functions available: copy every wide character truncated
   // to char, up to and including the terminating zero.
   // NOTE(review): 'I' is int while DestSize is size_t, so the comparison
   // mixes signed and unsigned operands.
   for (int I=0;I<DestSize;I++)
   {
     Dest[I]=(char)Src[I];
     if (Src[I]==0)
       break;
   }
 #endif
   // Guarantee the zero termination even if the conversion filled
   // the entire buffer.
   if (DestSize>0)
     Dest[DestSize-1]=0;
   
   // We tried to return the empty string if conversion is failed,
   // but it does not work well. WideCharToMultiByte returns 'failed' code
   // and partially converted string even if we wanted to convert only a part
   // of string and passed DestSize smaller than required for fully converted
   // string. Such call is the valid behavior in RAR code and we do not expect
   // the empty string in this case.
 
   return RetCode;
 }
 
 
 // Convert the zero terminated multibyte string 'Src' to a wide string in
 // 'Dest'. 'DestSize' is the size of the 'Dest' buffer in wide characters.
 // Returns false if the conversion failed. 'Dest' is not cleared in this
 // case, see the comment before 'return'. If DestSize>0, Dest[DestSize-1]
 // is always set to 0.
 // NOTE(review): '*Dest=0' below is executed before DestSize is examined,
 // so DestSize==0 would write outside of the buffer. Presumably callers
 // never pass 0, confirm.
 bool CharToWide(const char *Src,wchar *Dest,size_t DestSize)
 {
   bool RetCode=true;
   *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.
 
 #ifdef _WIN_ALL
   // Convert from the current ANSI code page. 0 result means the failure.
   if (MultiByteToWideChar(CP_ACP,0,Src,-1,Dest,(int)DestSize)==0)
     RetCode=false;
 
 // mbstowcs is broken in Android NDK r9.
 #elif defined(_APPLE)
   // The result of UtfToWide is ignored here, so RetCode stays 'true'.
   UtfToWide(Src,Dest,DestSize);
 
 #elif defined(MBFUNCTIONS)
   mbstate_t ps;
   memset (&ps, 0, sizeof(ps));
   const char *SrcParam=Src; // mbsrtowcs can change the pointer.
   size_t ResultingSize=mbsrtowcs(Dest,&SrcParam,DestSize,&ps);
   if (ResultingSize==(size_t)-1)
     RetCode=false;
   // Nothing was converted though the source is not empty.
   if (ResultingSize==0 && *Src!=0)
     RetCode=false;
 
   // If the standard conversion failed, retry with CharToWideMap, which maps
   // inconvertible high bytes to the private use area and sets RetCode.
   // DestSize>1 is required, so there is a room for at least one character
   // and the trailing zero.
   if (RetCode==false && DestSize>1)
     CharToWideMap(Src,Dest,DestSize,RetCode);
 #else
   // No conversion functions available: widen every char up to and including
   // the terminating zero.
   // NOTE(review): 'I' is int while DestSize is size_t, so the comparison
   // mixes signed and unsigned operands.
   for (int I=0;I<DestSize;I++)
   {
     Dest[I]=(wchar_t)Src[I];
     if (Src[I]==0)
       break;
   }
 #endif
   // Guarantee the zero termination even if the conversion filled
   // the entire buffer.
   if (DestSize>0)
     Dest[DestSize-1]=0;
 
   // We tried to return the empty string if conversion is failed,
   // but it does not work well. MultiByteToWideChar returns 'failed' code
   // even if we wanted to convert only a part of string and passed DestSize
   // smaller than required for fully converted string. Such call is the valid
   // behavior in RAR code and we do not expect the empty string in this case.
 
   return RetCode;
 }
 
 
 #if defined(_UNIX) && defined(MBFUNCTIONS)
 // Convert and restore mapped inconvertible Unicode characters.
 // We use it for extended ASCII names in Unix.
 // Returns false without touching 'Dest' and 'Success' if 'Src' does not
 // contain the mapped string mark, so the caller shall use the generic
 // conversion. Otherwise converts 'Src' to 'Dest' ('DestSize' is in chars),
 // restores the mapped codes to original bytes and returns true. 'Success'
 // is set to false if at least one character could not be converted and was
 // replaced by '_'.
 bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success)
 {
   // String with inconvertible characters mapped to private use Unicode area
   // must have the mark code somewhere.
   if (wcschr(Src,(wchar)MappedStringMark)==NULL)
     return false;
 
   // mbrlen below examines up to MB_CUR_MAX bytes of 'Dest', which can be
   // more than wcrtomb has actually written. Clean the buffer, so it never
   // reads uninitialized data.
   memset(Dest,0,DestSize);
 
   Success=true;
 
   // Every loop iteration can store up to MB_CUR_MAX bytes, so we stop while
   // such amount of space is still available. DestSize is unsigned, thus
   // plain 'DestSize-MB_CUR_MAX' would wrap around to a huge value for small
   // buffers and allow writing beyond 'Dest'.
   const size_t MaxCharSize=MB_CUR_MAX;
   const size_t WriteLimit=DestSize>MaxCharSize ? DestSize-MaxCharSize:0;
 
   uint SrcPos=0,DestPos=0;
   while (Src[SrcPos]!=0 && DestPos<WriteLimit)
   {
     // The mark itself does not produce any output.
     if (uint(Src[SrcPos])==MappedStringMark)
     {
       SrcPos++;
       continue;
     }
     // For security reasons do not restore low ASCII codes, so mapping cannot
     // be used to hide control codes like path separators.
     if (uint(Src[SrcPos])>=MapAreaStart+0x80 && uint(Src[SrcPos])<MapAreaStart+0x100)
       Dest[DestPos++]=char(uint(Src[SrcPos++])-MapAreaStart);
     else
     {
       mbstate_t ps;
       memset(&ps,0,sizeof(ps));
       if (wcrtomb(Dest+DestPos,Src[SrcPos],&ps)==(size_t)-1)
       {
         // Replace the inconvertible character and report the failure.
         Dest[DestPos]='_';
         Success=false;
       }
       SrcPos++;
       // Advance by the length of just stored multibyte character,
       // but by one byte at least.
       memset(&ps,0,sizeof(ps));
       int Length=mbrlen(Dest+DestPos,MB_CUR_MAX,&ps);
       DestPos+=Max(Length,1);
     }
   }
   // 'DestSize-1' would wrap around for the empty buffer.
   if (DestSize>0)
     Dest[Min(DestPos,DestSize-1)]=0;
   return true;
 }
 #endif
 
 
 #if defined(_UNIX) && defined(MBFUNCTIONS)
 // Convert the multibyte string 'Src' to wide 'Dest' holding 'DestSize'
 // characters, mapping bytes which cannot be converted to the private use
 // Unicode area. We use it for extended ASCII names in Unix.
 // 'Success' is set to true only if the end of 'Src' has been reached.
 void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success)
 {
   // Inconvertible bytes are stored as MapAreaStart+byte. The special
   // non-character mark code is placed once, before the first such byte.
   Success=false;
   bool Marked=false;
   uint In=0,Out=0;
   for (;Out<DestSize;)
   {
     if (Src[In]==0)
     {
       // Entire source string is processed.
       Success=true;
       break;
     }
 
     mbstate_t State;
     memset(&State,0,sizeof(State));
     const size_t Converted=mbrtowc(Dest+Out,Src+In,MB_CUR_MAX,&State);
     const bool Failed=Converted==(size_t)-1 || Converted==(size_t)-2;
 
     if (!Failed)
     {
       // Regular character. Skip all its source bytes, one at least.
       memset(&State,0,sizeof(State));
       int CharLength=mbrlen(Src+In,MB_CUR_MAX,&State);
       In+=Max(CharLength,1);
       Out++;
       continue;
     }
 
     // For security reasons we do not want to map low ASCII characters,
     // so we do not have additional .. and path separator codes.
     if (byte(Src[In])<0x80)
       break;
 
     if (!Marked)
     {
       Dest[Out++]=MappedStringMark;
       Marked=true;
       if (Out>=DestSize)
         break;
     }
     Dest[Out++]=byte(Src[In++])+MapAreaStart;
   }
   Dest[Min(Out,DestSize-1)]=0;
 }
 #endif
 
 
 // Store up to 'SrcSize' wide characters of 'Src' to 'Dest' as two byte
 // little endian values. Stops after storing the terminating zero, if it is
 // found earlier. SrcSize is in wide characters, not in bytes.
 // Returns 'Dest'.
 byte* WideToRaw(const wchar *Src,byte *Dest,size_t SrcSize)
 {
   byte *Out=Dest;
   size_t Remaining=SrcSize;
   while (Remaining-- > 0)
   {
     const wchar Cur=*Src++;
     *Out++=(byte)Cur;      // Low byte.
     *Out++=(byte)(Cur>>8); // High byte.
     if (Cur==0)
       break;
   }
   return Dest;
 }
 
 
 // Read up to 'DestSize' two byte little endian values from 'Src' to 'Dest'.
 // Stops after storing the zero value, if it is found earlier.
 // Returns 'Dest'.
 wchar* RawToWide(const byte *Src,wchar *Dest,size_t DestSize)
 {
   size_t Pos=0;
   while (Pos<DestSize)
   {
     const byte Low=Src[Pos*2],High=Src[Pos*2+1];
     Dest[Pos]=Low+(High<<8);
     if (Dest[Pos]==0)
       break;
     Pos++;
   }
   return Dest;
 }
 
 
 // Convert the zero terminated wide string 'Src' to UTF-8 in 'Dest'.
 // 'DestSize' is the size of 'Dest' in bytes including the space for
 // trailing zero. A character which does not fit to the remaining space
 // is not stored and the conversion stops. UTF-16 surrogate pairs in 'Src'
 // are combined to a single code point. Nothing is written if DestSize is 0.
 void WideToUtf(const wchar *Src,char *Dest,size_t DestSize)
 {
   // Without this check the final '*Dest=0' would write beyond
   // the empty buffer.
   if (DestSize==0)
     return;
 
   long dsize=(long)DestSize;
   dsize--; // Reserve the space for trailing zero.
 
   // Every iteration reserves one byte in the loop condition and additional
   // bytes in the branch conditions below. Once 'dsize' becomes negative,
   // it stays negative, so no further branch can write anything.
   while (*Src!=0 && --dsize>=0)
   {
     uint c=*(Src++);
     if (c<0x80)
       *(Dest++)=c;
     else
       if (c<0x800 && --dsize>=0)
       {
         *(Dest++)=(0xc0|(c>>6));
         *(Dest++)=(0x80|(c&0x3f));
       }
       else
       {
         if (c>=0xd800 && c<=0xdbff && *Src>=0xdc00 && *Src<=0xdfff) // Surrogate pair.
         {
           c=((c-0xd800)<<10)+(*Src-0xdc00)+0x10000;
           Src++;
         }
         if (c<0x10000 && (dsize-=2)>=0)
         {
           *(Dest++)=(0xe0|(c>>12));
           *(Dest++)=(0x80|((c>>6)&0x3f));
           *(Dest++)=(0x80|(c&0x3f));
         }
         else
           if (c < 0x200000 && (dsize-=3)>=0)
           {
             *(Dest++)=(0xf0|(c>>18));
             *(Dest++)=(0x80|((c>>12)&0x3f));
             *(Dest++)=(0x80|((c>>6)&0x3f));
             *(Dest++)=(0x80|(c&0x3f));
           }
       }
   }
   *Dest=0;
 }
 
 
 // Calculate the number of bytes occupied by UTF-8 representation
 // of zero terminated wide string 'Src', including the trailing zero.
 // Characters with codes 0x200000 and above do not add anything.
 size_t WideToUtfSize(const wchar *Src)
 {
   size_t Total=1; // Start from 1 to include terminating zero.
   while (*Src!=0)
   {
     if (*Src<0x80)
       Total++;
     else if (*Src<0x800)
       Total+=2;
     else if ((uint)*Src<0x10000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
     {
       const bool SurrogatePair=Src[0]>=0xd800 && Src[0]<=0xdbff &&
                                Src[1]>=0xdc00 && Src[1]<=0xdfff;
       if (SurrogatePair)
       {
         Total+=4; // 4 output bytes for Unicode surrogate pair.
         Src++;    // Skip the second half of the pair too.
       }
       else
         Total+=3;
     }
     else if ((uint)*Src<0x200000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
       Total+=4;
     Src++;
   }
   return Total;
 }
 
 
 // Convert the zero terminated UTF-8 string 'Src' to a wide string in 'Dest'.
 // 'DestSize' is the size of 'Dest' in wide characters including the space
 // for trailing zero. Returns false if a malformed sequence or a code point
 // above 0x10ffff was found. Running out of space in 'Dest' stops the
 // conversion, but does not change the returned value.
 // NOTE(review): the final '*Dest=0' is unconditional, so DestSize==0
 // would write outside of the buffer. Presumably callers never pass 0,
 // confirm.
 bool UtfToWide(const char *Src,wchar *Dest,size_t DestSize)
 {
   bool Success=true;
   long dsize=(long)DestSize;
   dsize--; // Reserve the space for trailing zero.
   while (*Src!=0)
   {
     // 'c' is the lead byte, 'd' is the decoded code point.
     uint c=byte(*(Src++)),d;
     if (c<0x80)
       d=c; // Single byte.
     else
       if ((c>>5)==6) // 110xxxxx: two byte sequence.
       {
         if ((*Src&0xc0)!=0x80)
         {
           Success=false;
           break;
         }
         d=((c&0x1f)<<6)|(*Src&0x3f);
         Src++;
       }
       else
         if ((c>>4)==14) // 1110xxxx: three byte sequence.
         {
           // Trailing zero fails the first test, so the short circuit
           // evaluation prevents reading beyond the end of string.
           if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80)
           {
             Success=false;
             break;
           }
           d=((c&0xf)<<12)|((Src[0]&0x3f)<<6)|(Src[1]&0x3f);
           Src+=2;
         }
         else
           if ((c>>3)==30) // 11110xxx: four byte sequence.
           {
             if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80 || (Src[2]&0xc0)!=0x80)
             {
               Success=false;
               break;
             }
             d=((c&7)<<18)|((Src[0]&0x3f)<<12)|((Src[1]&0x3f)<<6)|(Src[2]&0x3f);
             Src+=3;
           }
           else
           {
             // Continuation byte in the lead position or invalid lead byte.
             Success=false;
             break;
           }
     // Reserve one destination character.
     if (--dsize<0)
       break;
     if (d>0xffff)
     {
       // The second destination character is reserved for any code point
       // above 0xffff, even if wchar is wide enough to store it in one.
       if (--dsize<0)
         break;
       if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629.
       {
         // Skip this code point, but continue the conversion.
         Success=false;
         continue;
       }
       if (sizeof(*Dest)==2) // Use the surrogate pair.
       {
         *(Dest++)=((d-0x10000)>>10)+0xd800;
         *(Dest++)=(d&0x3ff)+0xdc00;
       }
       else
         *(Dest++)=d;
     }
     else
       *(Dest++)=d;
   }
   *Dest=0;
   return Success;
 }
 
 
 // Variant for zero terminated strings: measures the length of 'Src'
 // and passes it to the sized variant.
 bool IsTextUtf8(const byte *Src)
 {
   const size_t Length=strlen((const char *)Src);
   return IsTextUtf8(Src,Length);
 }
 
 
 // Source data can be both with and without UTF-8 BOM.
 bool IsTextUtf8(const byte *Src,size_t SrcSize)
 {
   while (SrcSize-- > 0)
   {
     byte C=*(Src++);
     int HighOne=0; // Number of leftmost '1' bits.
     for (byte Mask=0x80;Mask!=0 && (C & Mask)!=0;Mask>>=1)
       HighOne++;
     if (HighOne==1 || HighOne>6)
       return false;
     while (--HighOne > 0)
       if (SrcSize-- <= 0 || (*(Src++) & 0xc0)!=0x80)
         return false;
   }
   return true;
 }
 
 
 int wcsicomp(const wchar *s1,const wchar *s2)
 {
 #ifdef _WIN_ALL
   return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,-1,s2,-1)-2;
 #else
   while (true)
   {
     wchar u1 = towupper(*s1);
     wchar u2 = towupper(*s2);
     if (u1 != u2)
       return u1 < u2 ? -1 : 1;
     if (*s1==0)
       break;
     s1++;
     s2++;
   }
   return 0;
 #endif
 }
 
 
 int wcsnicomp(const wchar *s1,const wchar *s2,size_t n)
 {
 #ifdef _WIN_ALL
   // If we specify 'n' exceeding the actual string length, CompareString goes
   // beyond the trailing zero and compares garbage. So we need to limit 'n'
   // to real string length.
   size_t l1=Min(wcslen(s1)+1,n);
   size_t l2=Min(wcslen(s2)+1,n);
   return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,(int)l1,s2,(int)l2)-2;
 #else
   if (n==0)
     return 0;
   while (true)
   {
     wchar u1 = towupper(*s1);
     wchar u2 = towupper(*s2);
     if (u1 != u2)
       return u1 < u2 ? -1 : 1;
     if (*s1==0 || --n==0)
       break;
     s1++;
     s2++;
   }
   return 0;
 #endif
 }
 
 
 // Case insensitive search of 'search' in 'str'. Returns the pointer to
 // first occurrence or NULL if nothing is found. Since only non-empty
 // positions of 'str' are tested, NULL is returned for empty 'str'
 // even if 'search' is empty too.
 const wchar_t* wcscasestr(const wchar_t *str, const wchar_t *search)
 {
   for (const wchar_t *start=str;*start!=0;start++)
   {
     const wchar_t *hay=start;
     const wchar_t *needle=search;
     // Advance while 'search' is not exhausted and characters match.
     while (*needle!=0 && tolowerw(*hay)==tolowerw(*needle))
     {
       hay++;
       needle++;
     }
     if (*needle==0)
       return start;
   }
   return NULL;
 }
 
 
 #ifndef SFX_MODULE
 // Convert the zero terminated wide string 's' to lower case in place.
 // Returns 's'.
 wchar* wcslower(wchar *s)
 {
 #ifdef _WIN_ALL
   CharLower(s);
 #else
   wchar *cur=s;
   while (*cur!=0)
   {
     *cur=towlower(*cur);
     cur++;
   }
 #endif
   return s;
 }
 #endif
 
 
 #ifndef SFX_MODULE
 // Convert the zero terminated wide string 's' to upper case in place.
 // Returns 's'.
 wchar* wcsupper(wchar *s)
 {
 #ifdef _WIN_ALL
   CharUpper(s);
 #else
   wchar *cur=s;
   while (*cur!=0)
   {
     *cur=towupper(*cur);
     cur++;
   }
 #endif
   return s;
 }
 #endif
 
 
 
 
 // Return the upper case version of wide character 'ch'.
 int toupperw(int ch)
 {
 #if !defined(_WIN_ALL)
   return towupper(ch);
 #else
   // CharUpper is more reliable than towupper in Windows, which seems to be
   // C locale dependent even in Unicode version. For example, towupper failed
   // to convert lowercase Russian characters.
   // The character code is passed and returned in the pointer value.
   wchar *CharAsPtr=(wchar *)(INT_PTR)ch;
   return (int)(INT_PTR)CharUpper(CharAsPtr);
 #endif
 }
 
 
 // Return the lower case version of wide character 'ch'.
 int tolowerw(int ch)
 {
 #if !defined(_WIN_ALL)
   return towlower(ch);
 #else
   // CharLower is more reliable than towlower in Windows, where towlower
   // and towupper seem to depend on C locale even in Unicode version.
   // The character code is passed and returned in the pointer value.
   wchar *CharAsPtr=(wchar *)(INT_PTR)ch;
   return (int)(INT_PTR)CharLower(CharAsPtr);
 #endif
 }
 
 
 int atoiw(const wchar *s)
 {
   return (int)atoilw(s);
 }
 
 
 // Parse the optional '-' and following decimal digits in the beginning
 // of 's'. Parsing stops at first non-digit character. Returns 0 if there
 // are no digits.
 int64 atoilw(const wchar *s)
 {
   // We do use signed integers here, for example, in GUI SFX.
   const bool Negative=*s=='-';
   if (Negative)
     s++;
 
   // Use unsigned type here, since long string can overflow the variable
   // and signed integer overflow is undefined behavior in C++.
   uint64 Value=0;
   for (;*s>='0' && *s<='9';s++)
     Value=Value*10+(*s-'0');
 
   // Negate only if the converted value is not negative, to avoid
   // the signed overflow with undefined behavior when negating
   // 0x8000000000000000.
   const int64 Signed=int64(Value);
   if (Negative && Signed>=0)
     return -Signed;
   return Signed;
 }
 
 
 #ifdef DBCS_SUPPORTED
 // Global DBCS helper object. Its constructor fills the lead byte table.
 SupportDBCS gdbcs;
 
 // Read the code page information immediately, so the object is ready
 // to use after the construction.
 SupportDBCS::SupportDBCS()
 {
   Init();
 }
 
 
 // Detect if the current ANSI code page is multibyte and fill the table
 // of its lead bytes.
 void SupportDBCS::Init()
 {
   CPINFO CPInfo;
   // GetCPInfo returns 0 in case of failure and does not fill CPInfo then,
   // so we must not read MaxCharSize. Assume a single byte code page
   // in this case.
   DBCSMode=GetCPInfo(CP_ACP,&CPInfo)!=0 && CPInfo.MaxCharSize > 1;
   for (uint I=0;I<ASIZE(IsLeadByte);I++)
     IsLeadByte[I]=IsDBCSLeadByte(I)!=0;
 }
 
 
 // Return the pointer to character following the character at 's'.
 char* SupportDBCS::charnext(const char *s)
 {
   // Zero cannot be the trail byte. So if next byte after the lead byte
   // is 0, the string is corrupt and we'll better return the pointer to 0,
   // to break string processing loops.
   const bool DoubleByte=IsLeadByte[(byte)*s] && s[1]!=0;
   if (DoubleByte)
     return (char *)(s+2);
   return (char *)(s+1);
 }
 
 
 // Return the length of zero terminated string 's' in characters,
 // counting every double byte character as one.
 size_t SupportDBCS::strlend(const char *s)
 {
   size_t Length=0;
   while (*s!=0)
   {
     // Zero cannot be the trail byte. If the lead byte is followed by 0,
     // the string is corrupt. Skip only one byte then, same as charnext does,
     // so we stop at trailing zero instead of reading beyond it.
     if (IsLeadByte[(byte)*s] && s[1]!=0)
       s+=2;
     else
       s++;
     Length++;
   }
   return(Length);
 }
 
 
 // Find the first occurrence of single byte character 'c' in zero
 // terminated string 's'. Bytes of double byte characters are not matched.
 // Returns NULL if 'c' is not found.
 char* SupportDBCS::strchrd(const char *s, int c)
 {
   while (*s!=0)
     if (IsLeadByte[(byte)*s])
     {
       // Zero cannot be the trail byte. If the lead byte is followed by 0,
       // the string is corrupt. Skip only the lead byte then, so we stop
       // at trailing zero instead of reading beyond it.
       s+=s[1]!=0 ? 2:1;
     }
     else
       if (*s==c)
         return((char *)s);
       else
         s++;
   return(NULL);
 }
 
 
 void SupportDBCS::copychrd(char *dest,const char *src)
 {
   dest[0]=src[0];
   if (IsLeadByte[(byte)src[0]])
     dest[1]=src[1];
 }
 
 
 // Find the last occurrence of single byte character 'c' in zero
 // terminated string 's'. Bytes of double byte characters are not matched.
 // Returns NULL if 'c' is not found.
 char* SupportDBCS::strrchrd(const char *s, int c)
 {
   const char *found=NULL;
   while (*s!=0)
     if (IsLeadByte[(byte)*s])
     {
       // Zero cannot be the trail byte. If the lead byte is followed by 0,
       // the string is corrupt. Skip only the lead byte then, so we stop
       // at trailing zero instead of reading beyond it.
       s+=s[1]!=0 ? 2:1;
     }
     else
     {
       if (*s==c)
         found=s;
       s++;
     }
   return((char *)found);
 }
 #endif