mirror of git://gcc.gnu.org/git/gcc.git
407 lines
15 KiB
C++
407 lines
15 KiB
C++
/*
|
|
* Copyright (c) 2021-2025 Symas Corporation
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are
|
|
* met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above
|
|
* copyright notice, this list of conditions and the following disclaimer
|
|
* in the documentation and/or other materials provided with the
|
|
* distribution.
|
|
* * Neither the name of the Symas Corporation nor the names of its
|
|
* contributors may be used to endorse or promote products derived from
|
|
* this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#ifndef CHARMAPS_H
|
|
#define CHARMAPS_H
|
|
|
|
#include <unistd.h>
|
|
|
|
/* There are four distinct codeset domains in the COBOL compiler.
|
|
*
|
|
* First is the codeset of the console. Established by looking at what
|
|
* setlocale() reports, this can be either UTF-8 or some ASCII based code
|
|
* page. (We assume CP1252). Data coming from the console or the system,
|
|
* ACCEPT statements; redirected console input, getenv() and other system
|
|
* calls are in the "console" domain.
|
|
*
|
|
* Second is the internal single-byte-coded codeset of the data, in memory,
|
|
* being manipulated by the generated code of the cobol executable. The actual
|
|
* codeset of "internal" is either EBCDIC (in the form of Code Page 1140 or
|
|
* ASCII (Code Page 1252)
|
|
*
|
|
* Third is the C++ source code of the GCOBOL compiler; this comment is
|
|
* in that environment. We neither know, nor care, if this code is encoded in
|
|
* in UTF-8 (as is probable, in these enlighted days of 2022) or something like
|
|
* Code Page1252. We are going to regard it as "ascii" under the
|
|
* assumption that there is no reason for any character in the compiler's
|
|
* source code to have a code point outside of the plain vanilla 0x20 through
|
|
* 0x7F range.
|
|
*
|
|
* Fourth is the "raw" COBOL source code that is the input to the GCOBOL
|
|
* compiler. This domain can be either UTF-8 or something like CodePage1252.
|
|
* Which encoding is relevant; The literal string MOVE "<euro>1234" is seven
|
|
* bytes long in UTF-8, and five bytes long in CP1252. We start with an
|
|
* assumption that it is UTF-8 and switch to CP1252 upon encountering a byte
|
|
* sequence with values above 0x80 that can't be UTF-8. We have provision for
|
|
* forcing it to be one or the other. Codepoints in that domain are referenced
|
|
* as "raw". Codepoint in the "raw" domain don't last long; they are be
|
|
* converted to either "ascii" or "internal" early on, as necessary.
|
|
*/
|
|
|
|
|
|
/* Notes on character codesets:
|
|
|
|
This library is implemented to handle "native" codesets of either ASCII (in
|
|
the form of a single-byte-coded codeset like code page 1252) or EBCDIC (in
|
|
the form of a single-byte-coded codeset like code page 1140).
|
|
|
|
This C/C++ source code, however, is assumed to be an ASCII-based codeset,
|
|
so that a character constant like a space is assumed to encode as 0x20.
|
|
|
|
Furthermore, we assume that the codeset of the COBOL source code being
|
|
compiled is also ASCII-based, even if it is actually UTF-8. Said another
|
|
way, characters encoded between zero and 127 are regarded as ASCII.
|
|
|
|
This means that we are not going to try to compile EBCDIC COBOL source code;
|
|
any such will have to be externally converted to ASCII before feeding it
|
|
through this compiler on an ASCII based Linux system.
|
|
|
|
This situation is rife for confusion here in the source code for the
|
|
library.
|
|
|
|
To help reduce that confusion, we are going to eschew character constants
|
|
in the C/C++ source code. Instead, we use symbolic versions. In general,
|
|
"source_space" means 0x20, while "internal_space" will be either 0x20
|
|
when using the ASCII-based native codeset, or it will be 0x40 when using
|
|
the EBCDIC-based native codeset.
|
|
|
|
Maintaining one's sanity while learning and working with this C/C++ code
|
|
will require a firm grip on context. You'll have to keep track of whether
|
|
the character is being used to analyze the ASCII-based COBOL source, or
|
|
whether the character in question is part of the native COBOL cobol data
|
|
that is being analyzed or generated.
|
|
|
|
For example, when a PICTURE string has in it a source_nine, the generated
|
|
result in the variable is based on character_zero.
|
|
|
|
Stay alert! */
|
|
|
|
extern int __gg__decimal_point ;
|
|
extern int __gg__decimal_separator ;
|
|
extern int __gg__quote_character ;
|
|
extern int __gg__low_value_character ;
|
|
extern int __gg__high_value_character ;
|
|
extern char **__gg__currency_signs ;
|
|
extern int __gg__default_currency_sign;
|
|
extern char *__gg__ct_currency_signs[256]; // Compile-time currency signs
|
|
|
|
#define NULLCH ('\0')
|
|
#define DEGENERATE_HIGH_VALUE 0xFF
|
|
#define DEGENERATE_LOW_VALUE 0x00
|
|
|
|
#define ascii_A ((uint8_t)('A'))
|
|
#define ascii_B ((uint8_t)('B'))
|
|
#define ascii_C ((uint8_t)('C'))
|
|
#define ascii_D ((uint8_t)('D'))
|
|
#define ascii_E ((uint8_t)('E'))
|
|
#define ascii_F ((uint8_t)('F'))
|
|
#define ascii_G ((uint8_t)('G'))
|
|
#define ascii_H ((uint8_t)('H'))
|
|
#define ascii_I ((uint8_t)('I'))
|
|
#define ascii_J ((uint8_t)('J'))
|
|
#define ascii_K ((uint8_t)('K'))
|
|
#define ascii_L ((uint8_t)('L'))
|
|
#define ascii_M ((uint8_t)('M'))
|
|
#define ascii_N ((uint8_t)('N'))
|
|
#define ascii_O ((uint8_t)('O'))
|
|
#define ascii_P ((uint8_t)('P'))
|
|
#define ascii_Q ((uint8_t)('Q'))
|
|
#define ascii_R ((uint8_t)('R'))
|
|
#define ascii_S ((uint8_t)('S'))
|
|
#define ascii_T ((uint8_t)('T'))
|
|
#define ascii_U ((uint8_t)('U'))
|
|
#define ascii_V ((uint8_t)('V'))
|
|
#define ascii_W ((uint8_t)('W'))
|
|
#define ascii_X ((uint8_t)('X'))
|
|
#define ascii_Y ((uint8_t)('Y'))
|
|
#define ascii_Z ((uint8_t)('Z'))
|
|
#define ascii_a ((uint8_t)('a'))
|
|
#define ascii_b ((uint8_t)('b'))
|
|
#define ascii_c ((uint8_t)('c'))
|
|
#define ascii_d ((uint8_t)('d'))
|
|
#define ascii_e ((uint8_t)('e'))
|
|
#define ascii_f ((uint8_t)('f'))
|
|
#define ascii_g ((uint8_t)('g'))
|
|
#define ascii_h ((uint8_t)('h'))
|
|
#define ascii_i ((uint8_t)('i'))
|
|
#define ascii_j ((uint8_t)('j'))
|
|
#define ascii_k ((uint8_t)('k'))
|
|
#define ascii_l ((uint8_t)('l'))
|
|
#define ascii_m ((uint8_t)('m'))
|
|
#define ascii_n ((uint8_t)('n'))
|
|
#define ascii_o ((uint8_t)('o'))
|
|
#define ascii_p ((uint8_t)('p'))
|
|
#define ascii_q ((uint8_t)('q'))
|
|
#define ascii_r ((uint8_t)('r'))
|
|
#define ascii_s ((uint8_t)('s'))
|
|
#define ascii_t ((uint8_t)('t'))
|
|
#define ascii_u ((uint8_t)('u'))
|
|
#define ascii_v ((uint8_t)('v'))
|
|
#define ascii_w ((uint8_t)('w'))
|
|
#define ascii_x ((uint8_t)('x'))
|
|
#define ascii_y ((uint8_t)('y'))
|
|
#define ascii_z ((uint8_t)('z'))
|
|
#define ascii_space ((uint8_t)(' '))
|
|
#define ascii_zero ((uint8_t)('0'))
|
|
#define ascii_0 ((uint8_t)('0'))
|
|
#define ascii_1 ((uint8_t)('1'))
|
|
#define ascii_2 ((uint8_t)('2'))
|
|
#define ascii_3 ((uint8_t)('3'))
|
|
#define ascii_4 ((uint8_t)('4'))
|
|
#define ascii_5 ((uint8_t)('5'))
|
|
#define ascii_6 ((uint8_t)('6'))
|
|
#define ascii_7 ((uint8_t)('7'))
|
|
#define ascii_8 ((uint8_t)('8'))
|
|
#define ascii_9 ((uint8_t)('9'))
|
|
#define ascii_nine ((uint8_t)('9'))
|
|
#define ascii_period ((uint8_t)('.'))
|
|
#define ascii_colon ((uint8_t)(':'))
|
|
#define ascii_comma ((uint8_t)(','))
|
|
#define ascii_dollar_sign ((uint8_t)('$'))
|
|
#define ascii_dquote ((uint8_t)('"'))
|
|
#define ascii_oparen ((uint8_t)('('))
|
|
#define ascii_caret ((uint8_t)('^'))
|
|
#define ascii_slash ((uint8_t)('/'))
|
|
#define ascii_plus ((uint8_t)('+'))
|
|
#define ascii_minus ((uint8_t)('-'))
|
|
#define ascii_hyphen ((uint8_t)('-'))
|
|
#define ascii_underscore ((uint8_t)('_'))
|
|
#define ascii_asterisk ((uint8_t)('*'))
|
|
#define ascii_query ((uint8_t)('?'))
|
|
#define ascii_cr ((uint8_t)('\r'))
|
|
#define ascii_ff ((uint8_t)('\f'))
|
|
#define ascii_newline ((uint8_t)('\n'))
|
|
#define ascii_return ((uint8_t)('\r'))
|
|
|
|
extern unsigned char __gg__data_space[1] ;
|
|
extern unsigned char __gg__data_low_values[1] ;
|
|
extern unsigned char __gg__data_zeros[1] ;
|
|
extern unsigned char __gg__data_high_values[1] ;
|
|
extern unsigned char __gg__data_quotes[1] ;
|
|
extern unsigned char __gg__data_upsi_0[2] ;
|
|
extern short __gg__data_return_code ;
|
|
|
|
// These are the various hardcoded tables used for conversions.
|
|
extern const unsigned short __gg__one_to_one_values[256];
|
|
extern const unsigned short __gg__cp1252_to_cp1140_values[256];
|
|
extern const unsigned short __gg__cp1140_to_cp1252_values[256];
|
|
|
|
// These are the two standard collations.
|
|
extern const unsigned short __gg__cp1252_to_ebcdic_collation[256];
|
|
extern const unsigned short __gg__ebcdic_to_cp1252_collation[256];
|
|
|
|
const char * __gg__encoding_iconv_name( cbl_encoding_t encoding );
|
|
cbl_encoding_t __gg__encoding_iconv_type( const char *name );
|
|
|
|
char * __gg__iconverter(cbl_encoding_t from,
|
|
cbl_encoding_t to,
|
|
const char *str,
|
|
size_t length,
|
|
size_t *outlength);
|
|
|
|
#define DEFAULT_CHARMAP_SOURCE (iconv_CP1252_e)
|
|
|
|
class charmap_t
|
|
{
|
|
private:
|
|
// This is the encoding of this character map
|
|
cbl_encoding_t m_encoding;
|
|
|
|
enum
|
|
{
|
|
sign_type_ascii,
|
|
sign_type_ebcdic,
|
|
} m_numeric_sign_type;
|
|
|
|
// This map retains the ASCII-to-encoded value in m_encoding, so that iconv
|
|
// need be called but once for each ASCII value.
|
|
std::unordered_map<int, int>m_map_of_encodings;
|
|
|
|
void determine_sign_type()
|
|
{
|
|
if( mapped_character(ascii_0) & 0x80 )
|
|
{
|
|
m_numeric_sign_type = sign_type_ebcdic;
|
|
}
|
|
else
|
|
{
|
|
m_numeric_sign_type = sign_type_ascii;
|
|
}
|
|
}
|
|
|
|
public:
|
|
explicit charmap_t(cbl_encoding_t e) : m_encoding(e)
|
|
{
|
|
determine_sign_type();
|
|
}
|
|
explicit charmap_t(uint16_t e) : m_encoding(static_cast<cbl_encoding_t>(e))
|
|
{
|
|
determine_sign_type();
|
|
}
|
|
|
|
int mapped_character(int ch)
|
|
{
|
|
// The assumption is that anybody calling this routine is providing
|
|
// a single-byte character in the DEFAULT_CHARMAP_SOURCE encoding. We
|
|
// return the equivalent character in the m_encoding
|
|
int retval;
|
|
std::unordered_map<int, int>::const_iterator it =
|
|
m_map_of_encodings.find(ch);
|
|
if( it != m_map_of_encodings.end() )
|
|
{
|
|
retval = it->second;
|
|
}
|
|
else
|
|
{
|
|
retval = 0;
|
|
size_t outlength = 0;
|
|
const char *mapped = __gg__iconverter(DEFAULT_CHARMAP_SOURCE,
|
|
m_encoding,
|
|
PTRCAST(char, &ch),
|
|
1,
|
|
&outlength);
|
|
memcpy(&retval, mapped, outlength);
|
|
m_map_of_encodings[ch] = retval;
|
|
}
|
|
return retval;
|
|
}
|
|
|
|
int decimal_point()
|
|
{
|
|
return mapped_character(__gg__decimal_point);
|
|
}
|
|
int decimal_separator()
|
|
{
|
|
return mapped_character(__gg__decimal_separator);
|
|
}
|
|
int quote_character()
|
|
{
|
|
return mapped_character(__gg__quote_character);
|
|
}
|
|
int low_value_character()
|
|
{
|
|
return __gg__low_value_character;
|
|
}
|
|
int high_value_character()
|
|
{
|
|
return __gg__high_value_character;
|
|
}
|
|
|
|
int figconst_character(cbl_figconst_t figconst)
|
|
{
|
|
int const_char = 0; // Head off a compiler warning
|
|
switch(figconst)
|
|
{
|
|
case normal_value_e :
|
|
const_char = -1;
|
|
break;
|
|
case low_value_e :
|
|
const_char = low_value_character();
|
|
break;
|
|
case zero_value_e :
|
|
const_char = mapped_character(ascii_0);
|
|
break;
|
|
case space_value_e :
|
|
const_char = mapped_character(ascii_space);
|
|
break;
|
|
case quote_value_e :
|
|
const_char = quote_character();
|
|
break;
|
|
case high_value_e :
|
|
const_char = high_value_character();
|
|
break;
|
|
case null_value_e:
|
|
const_char = '\0';
|
|
break;
|
|
default:
|
|
abort();
|
|
break;
|
|
}
|
|
return const_char;
|
|
}
|
|
|
|
bool
|
|
is_digit_negative(int digit)
|
|
{
|
|
bool retval;
|
|
switch(m_numeric_sign_type)
|
|
{
|
|
case sign_type_ascii:
|
|
retval = !!(digit & NUMERIC_DISPLAY_SIGN_BIT_ASCII);
|
|
break;
|
|
|
|
case sign_type_ebcdic:
|
|
retval = !!((~digit) & NUMERIC_DISPLAY_SIGN_BIT_EBCDIC);
|
|
break;
|
|
}
|
|
return retval;
|
|
}
|
|
|
|
int
|
|
set_digit_negative(int digit, bool is_negative)
|
|
{
|
|
switch(m_numeric_sign_type)
|
|
{
|
|
case sign_type_ascii:
|
|
if( is_negative )
|
|
{
|
|
digit |= NUMERIC_DISPLAY_SIGN_BIT_ASCII;
|
|
}
|
|
else
|
|
{
|
|
digit &= ~NUMERIC_DISPLAY_SIGN_BIT_ASCII;
|
|
}
|
|
break;
|
|
|
|
case sign_type_ebcdic:
|
|
if( is_negative )
|
|
{
|
|
digit &= ~NUMERIC_DISPLAY_SIGN_BIT_EBCDIC;
|
|
}
|
|
else
|
|
{
|
|
digit |= NUMERIC_DISPLAY_SIGN_BIT_EBCDIC;
|
|
}
|
|
break;
|
|
}
|
|
return digit;
|
|
}
|
|
|
|
bool
|
|
is_like_ebcdic() const
|
|
{
|
|
return m_numeric_sign_type == sign_type_ebcdic;
|
|
}
|
|
|
|
};
|
|
|
|
charmap_t *__gg__get_charmap(cbl_encoding_t encoding);
|
|
|
|
#endif |