mirror of
https://github.com/NationalSecurityAgency/ghidra.git
synced 2024-11-28 23:21:46 +00:00
Merge remote-tracking branch 'origin/GP-4160_dev747368_charset_logic--SQUASHED'
This commit is contained in:
commit
9226e7e090
@ -45,8 +45,6 @@ import ghidra.util.*;
|
||||
*/
|
||||
public class StringDataInstance {
|
||||
|
||||
private static final int ASCII_MAX = 0x7f;
|
||||
|
||||
/**
|
||||
* Returns true if the {@link Data} instance is a 'string'.
|
||||
*
|
||||
@ -61,8 +59,8 @@ public class StringDataInstance {
|
||||
if (dt instanceof AbstractStringDataType) {
|
||||
return true;
|
||||
}
|
||||
if (dt instanceof Array) {
|
||||
ArrayStringable as = ArrayStringable.getArrayStringable(((Array) dt).getDataType());
|
||||
if (dt instanceof Array arrayDt) {
|
||||
ArrayStringable as = ArrayStringable.getArrayStringable(arrayDt.getDataType());
|
||||
return (as != null) && as.hasStringValue(data);
|
||||
}
|
||||
return false;
|
||||
@ -79,11 +77,11 @@ public class StringDataInstance {
|
||||
* @return boolean true if data type is or could be a string
|
||||
*/
|
||||
public static boolean isStringDataType(DataType dt) {
|
||||
if (dt instanceof TypeDef) {
|
||||
dt = ((TypeDef) dt).getBaseDataType();
|
||||
if (dt instanceof TypeDef td) {
|
||||
dt = td.getBaseDataType();
|
||||
}
|
||||
return dt instanceof AbstractStringDataType || (dt instanceof Array &&
|
||||
ArrayStringable.getArrayStringable(((Array) dt).getDataType()) != null);
|
||||
return dt instanceof AbstractStringDataType || (dt instanceof Array arrayDt &&
|
||||
ArrayStringable.getArrayStringable(arrayDt.getDataType()) != null);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -161,13 +159,12 @@ public class StringDataInstance {
|
||||
return NULL_INSTANCE;
|
||||
}
|
||||
DataType dt = data.getBaseDataType();
|
||||
if (dt instanceof AbstractStringDataType) {
|
||||
return ((AbstractStringDataType) dt).getStringDataInstance(data, data,
|
||||
data.getLength());
|
||||
if (dt instanceof AbstractStringDataType asdt) {
|
||||
return asdt.getStringDataInstance(data, data, data.getLength());
|
||||
}
|
||||
if (dt instanceof Array && data.isInitializedMemory()) {
|
||||
if (dt instanceof Array arrayDt && data.isInitializedMemory()) {
|
||||
ArrayStringable arrayStringable =
|
||||
ArrayStringable.getArrayStringable(((Array) dt).getDataType());
|
||||
ArrayStringable.getArrayStringable(arrayDt.getDataType());
|
||||
if (arrayStringable != null && arrayStringable.hasStringValue(data)) {
|
||||
return new StringDataInstance(arrayStringable, data, data, data.getLength(), true);
|
||||
}
|
||||
@ -188,15 +185,15 @@ public class StringDataInstance {
|
||||
*/
|
||||
public static StringDataInstance getStringDataInstance(DataType dataType, MemBuffer buf,
|
||||
Settings settings, int length) {
|
||||
if (dataType instanceof AbstractStringDataType) {
|
||||
return ((AbstractStringDataType) dataType).getStringDataInstance(buf, settings, length);
|
||||
if (dataType instanceof AbstractStringDataType asdt) {
|
||||
return asdt.getStringDataInstance(buf, settings, length);
|
||||
}
|
||||
boolean isArray = dataType instanceof Array;
|
||||
if (isArray) {
|
||||
dataType = ArrayStringable.getArrayStringable(((Array) dataType).getDataType());
|
||||
}
|
||||
if (dataType instanceof ArrayStringable &&
|
||||
((ArrayStringable) dataType).hasStringValue(settings) && buf.isInitializedMemory()) {
|
||||
if (dataType instanceof ArrayStringable arrayStringable &&
|
||||
arrayStringable.hasStringValue(settings) && buf.isInitializedMemory()) {
|
||||
|
||||
// this could be either a charsequence or an array of char elements
|
||||
return new StringDataInstance(dataType, settings, buf, length, isArray);
|
||||
@ -250,11 +247,6 @@ public class StringDataInstance {
|
||||
public static final String UNKNOWN = "??";
|
||||
public static final String UNKNOWN_DOT_DOT_DOT = "??...";
|
||||
|
||||
/**
|
||||
* A string with a single char that is the Byte-Order-Mark character.
|
||||
*/
|
||||
private static final String BOM_RESULT_STR = "\ufeff";
|
||||
|
||||
static final int SIZEOF_PASCAL255_STR_LEN_FIELD = 1;
|
||||
static final int SIZEOF_PASCAL64k_STR_LEN_FIELD = 2;
|
||||
|
||||
@ -339,8 +331,7 @@ public class StringDataInstance {
|
||||
|
||||
private static String getTranslatedValue(Settings settings, MemBuffer buf) {
|
||||
// Translation only exists for defined Data which corresponds to settings.
|
||||
if (settings instanceof Data) {
|
||||
Data data = (Data) settings;
|
||||
if (settings instanceof Data data) {
|
||||
if (data.isDefined()) {
|
||||
return TRANSLATION.getTranslatedValue(data);
|
||||
}
|
||||
@ -374,8 +365,8 @@ public class StringDataInstance {
|
||||
}
|
||||
|
||||
private static StringLayoutEnum getLayoutFromDataType(DataType dataType) {
|
||||
if (dataType instanceof AbstractStringDataType) {
|
||||
return ((AbstractStringDataType) dataType).getStringLayout();
|
||||
if (dataType instanceof AbstractStringDataType asdt) {
|
||||
return asdt.getStringLayout();
|
||||
}
|
||||
if (dataType instanceof AbstractIntegerDataType || dataType instanceof BitFieldDataType) {
|
||||
return StringLayoutEnum.CHAR_SEQ;
|
||||
@ -384,11 +375,11 @@ public class StringDataInstance {
|
||||
}
|
||||
|
||||
static String getCharsetNameFromDataTypeOrSettings(DataType dataType, Settings settings) {
|
||||
if (dataType instanceof BitFieldDataType) {
|
||||
dataType = ((BitFieldDataType) dataType).getBaseDataType();
|
||||
if (dataType instanceof BitFieldDataType bfdt) {
|
||||
dataType = bfdt.getBaseDataType();
|
||||
}
|
||||
return (dataType instanceof DataTypeWithCharset)
|
||||
? ((DataTypeWithCharset) dataType).getCharsetName(settings)
|
||||
return (dataType instanceof DataTypeWithCharset dtwcs)
|
||||
? dtwcs.getCharsetName(settings)
|
||||
: DEFAULT_CHARSET_NAME;
|
||||
}
|
||||
|
||||
@ -586,12 +577,16 @@ public class StringDataInstance {
|
||||
}
|
||||
byte[] stringBytes = convertPaddedToUnpadded(getStringBytes());
|
||||
if (stringBytes == null) {
|
||||
return StringDataInstance.UNKNOWN_DOT_DOT_DOT;
|
||||
return UNKNOWN_DOT_DOT_DOT;
|
||||
}
|
||||
AdjustedCharsetInfo aci = getAdjustedCharsetInfo(stringBytes);
|
||||
String str = convertBytesToString(stringBytes, aci);
|
||||
ByteBuffer bb = ByteBuffer.wrap(stringBytes);
|
||||
String adjustedCharsetName = getAdjustedCharsetInfo(bb); // force BE or LE variants of UTF charsets
|
||||
|
||||
return str;
|
||||
if (!Charset.isSupported(adjustedCharsetName)) {
|
||||
return UNKNOWN_DOT_DOT_DOT;
|
||||
}
|
||||
Charset cs = Charset.forName(adjustedCharsetName);
|
||||
return new String(stringBytes, cs);
|
||||
}
|
||||
|
||||
private byte[] getStringBytes() {
|
||||
@ -709,13 +704,6 @@ public class StringDataInstance {
|
||||
return buf.isBigEndian() ? Endian.BIG : Endian.LITTLE;
|
||||
}
|
||||
|
||||
private String convertBytesToString(byte[] bytes, AdjustedCharsetInfo aci) {
|
||||
Charset cs = Charset.isSupported(aci.charsetName) ? Charset.forName(aci.charsetName) : null;
|
||||
return (cs != null)
|
||||
? new String(bytes, aci.byteStartOffset, bytes.length - aci.byteStartOffset, cs)
|
||||
: convertBytesToStringCustomCharset(bytes, aci);
|
||||
}
|
||||
|
||||
private AdjustedCharsetInfo getAdjustedCharsetInfo() {
|
||||
if (length == -1 && getStringLength() == -1) {
|
||||
return getAdjustedCharsetInfo(new byte[] {});
|
||||
@ -751,30 +739,21 @@ public class StringDataInstance {
|
||||
return result;
|
||||
}
|
||||
|
||||
private static DataConverter getDataConverter(Endian endian) {
|
||||
return endian == Endian.BIG ? BigEndianDataConverter.INSTANCE
|
||||
: LittleEndianDataConverter.INSTANCE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Converts a byte array to String based on a custom Ghidra charset name.
|
||||
*/
|
||||
private static String convertBytesToStringCustomCharset(byte[] bytes, AdjustedCharsetInfo aci) {
|
||||
switch (aci.charsetName) {
|
||||
case "UTF-32LE":
|
||||
case "UTF-32BE":
|
||||
// fall-back because real jvm supplied UTF-32 Charset isn't guaranteed to be present
|
||||
DataConverter dc = getDataConverter(aci.endian);
|
||||
int[] codePoints = new int[(bytes.length - aci.byteStartOffset) / 4];
|
||||
for (int i = 0; i < codePoints.length; i++) {
|
||||
codePoints[i] = dc.getInt(bytes, aci.byteStartOffset + (i * 4));
|
||||
if (codePoints[i] < 0 || codePoints[i] > Character.MAX_CODE_POINT) {
|
||||
codePoints[i] = StringUtilities.UNICODE_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
return new String(codePoints, 0, codePoints.length);
|
||||
private String getAdjustedCharsetInfo(ByteBuffer bb) {
|
||||
String result = charsetName;
|
||||
if (CharsetInfo.isBOMCharset(charsetName)) {
|
||||
Endian endian = getEndiannessFromBOM(bb, charSize);
|
||||
if (endian == null) {
|
||||
endian = endianSetting;
|
||||
}
|
||||
if (endian == null) {
|
||||
endian = getMemoryEndianness();
|
||||
}
|
||||
// add "LE" or "BE" to end of charset's name depending
|
||||
// of the discovered endianness of the string
|
||||
result += endian.toShortString();
|
||||
}
|
||||
return null;
|
||||
return result;
|
||||
}
|
||||
|
||||
private static Endian getEndiannessFromBOM(byte[] bytes, int charSize) {
|
||||
@ -792,6 +771,25 @@ public class StringDataInstance {
|
||||
return null;
|
||||
}
|
||||
|
||||
private static Endian getEndiannessFromBOM(ByteBuffer bb, int charSize) {
|
||||
if (bb.remaining() < charSize) {
|
||||
return null;
|
||||
}
|
||||
|
||||
byte[] bytes = new byte[charSize];
|
||||
bb.get(0, bytes);
|
||||
|
||||
int be_val = (int) BigEndianDataConverter.INSTANCE.getValue(bytes, charSize);
|
||||
switch (be_val) {
|
||||
case StringUtilities.UNICODE_BE_BYTE_ORDER_MARK:
|
||||
return Endian.BIG;
|
||||
case StringUtilities.UNICODE_LE16_BYTE_ORDER_MARK:
|
||||
case StringUtilities.UNICODE_LE32_BYTE_ORDER_MARK:
|
||||
return Endian.LITTLE;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a formatted version of the string returned by {@link #getStringValue()}.
|
||||
* <p>
|
||||
@ -806,8 +804,8 @@ public class StringDataInstance {
|
||||
*/
|
||||
public String getStringRepresentation() {
|
||||
return showTranslation && translatedValue != null
|
||||
? getTranslatedStringRepresentation(translatedValue)
|
||||
: getStringRep(StringRenderBuilder.DOUBLE_QUOTE, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
? getTranslatedStringRepresentation(translatedValue)
|
||||
: getStringRep(StringRenderBuilder.DOUBLE_QUOTE);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -824,13 +822,15 @@ public class StringDataInstance {
|
||||
* @return formatted String
|
||||
*/
|
||||
public String getStringRepresentation(boolean originalOrTranslated) {
|
||||
if (!originalOrTranslated && translatedValue == null) {
|
||||
return UNKNOWN;
|
||||
}
|
||||
return originalOrTranslated
|
||||
? getStringRep(StringRenderBuilder.DOUBLE_QUOTE, StringRenderBuilder.DOUBLE_QUOTE)
|
||||
: translatedValue != null ? getTranslatedStringRepresentation(translatedValue)
|
||||
: UNKNOWN;
|
||||
? getStringRep(StringRenderBuilder.DOUBLE_QUOTE)
|
||||
: getTranslatedStringRepresentation(translatedValue);
|
||||
}
|
||||
|
||||
private String getStringRep(char quoteChar, char quoteCharMulti) {
|
||||
private String getStringRep(char quoteChar) {
|
||||
|
||||
if (isProbe() || isBadCharSize() || !buf.isInitializedMemory()) {
|
||||
return UNKNOWN;
|
||||
@ -840,122 +840,22 @@ public class StringDataInstance {
|
||||
if (stringBytes == null) {
|
||||
return UNKNOWN_DOT_DOT_DOT;
|
||||
}
|
||||
AdjustedCharsetInfo aci = getAdjustedCharsetInfo(stringBytes);
|
||||
String stringValue = convertBytesToString(stringBytes, aci);
|
||||
if (stringValue == null) {
|
||||
|
||||
ByteBuffer bb = ByteBuffer.wrap(stringBytes);
|
||||
String adjustedCharsetName = getAdjustedCharsetInfo(bb); // force BE or LE variants of UTF charsets
|
||||
|
||||
StringRenderBuilder renderer =
|
||||
new StringRenderBuilder(adjustedCharsetName.startsWith("UTF"), charSize, quoteChar);
|
||||
|
||||
if (!Charset.isSupported(adjustedCharsetName)) {
|
||||
return UNKNOWN_DOT_DOT_DOT;
|
||||
}
|
||||
|
||||
if (stringValue.length() == 0 && aci.byteStartOffset != 0) {
|
||||
// If the byteStartOffset isn't zero it means there was one char that was the unicode BOM.
|
||||
// Asking the Charset to decode it returned nothing, so force it.
|
||||
stringValue = BOM_RESULT_STR;
|
||||
}
|
||||
|
||||
// if we get the same number of characters out that we put into the decoder,
|
||||
// then its a good chance there is a one-to-one correspondence between original char
|
||||
// offsets and decoded char offsets.
|
||||
boolean isByteToStringCharEquiv =
|
||||
stringValue.length() == ((stringBytes.length - aci.byteStartOffset) / charSize);
|
||||
|
||||
stringValue = stringLayout.shouldTrimTrailingNulls() ? trimNulls(stringValue) : stringValue;
|
||||
|
||||
StringRenderBuilder strBuf = new StringRenderBuilder(charSize,
|
||||
stringValue.length() == 1 ? quoteChar : quoteCharMulti);
|
||||
|
||||
if (stringValue.isEmpty() || (stringValue.length() == 1 && stringValue.charAt(0) == 0)) {
|
||||
// force the string renderer into "string" mode so we get empty quotes when done.
|
||||
strBuf.addString("");
|
||||
}
|
||||
|
||||
// For each 32bit character in the java string try to add it to the StringRenderBuilder
|
||||
for (int i = 0, strLength = stringValue.length(); i < strLength;) {
|
||||
int codePoint = stringValue.codePointAt(i);
|
||||
|
||||
RENDER_ENUM currentCharRenderSetting = renderSetting;
|
||||
if (codePoint == StringUtilities.UNICODE_REPLACEMENT && isByteToStringCharEquiv &&
|
||||
!isReplacementCharAt(stringBytes, i * charSize + aci.byteStartOffset)) {
|
||||
// if this is a true decode error and we can recover the original bytes,
|
||||
// then force the render mode to byte seq.
|
||||
currentCharRenderSetting = RENDER_ENUM.BYTE_SEQ;
|
||||
}
|
||||
|
||||
if (StringUtilities.isControlCharacterOrBackslash(codePoint)) {
|
||||
strBuf.addString(StringUtilities.convertCodePointToEscapeSequence(codePoint));
|
||||
}
|
||||
else if (codePoint == 0x0000 && renderSetting != RENDER_ENUM.BYTE_SEQ) {
|
||||
strBuf.addEscapedChar('0');
|
||||
}
|
||||
else if (StringUtilities.isDisplayable(codePoint)) {
|
||||
strBuf.addCodePointChar(codePoint);
|
||||
}
|
||||
else {
|
||||
// not simple ascii, decide how to handle:
|
||||
// add the character to the string in a format depending on the
|
||||
// render settings. ISO control chars are forced to be
|
||||
// escaped regardless of the render setting.
|
||||
if (currentCharRenderSetting == RENDER_ENUM.ALL) {
|
||||
if (codePoint <= ASCII_MAX) {
|
||||
// render non-displayable, non-control-char ascii-ish bytes as bytes instead
|
||||
// of as escape sequences
|
||||
currentCharRenderSetting = RENDER_ENUM.BYTE_SEQ;
|
||||
}
|
||||
else if (Character.isISOControl(codePoint) || !Character.isDefined(codePoint) ||
|
||||
codePoint == StringUtilities.UNICODE_BE_BYTE_ORDER_MARK) {
|
||||
currentCharRenderSetting = RENDER_ENUM.ESC_SEQ;
|
||||
}
|
||||
}
|
||||
|
||||
switch (currentCharRenderSetting) {
|
||||
case ALL:
|
||||
strBuf.addCodePointChar(codePoint);
|
||||
break;
|
||||
case BYTE_SEQ:
|
||||
strBuf.addByteSeq(getOriginalBytes(isByteToStringCharEquiv, i, codePoint,
|
||||
stringBytes, aci));
|
||||
break;
|
||||
case ESC_SEQ:
|
||||
strBuf.addEscapedCodePoint(codePoint);
|
||||
break;
|
||||
}
|
||||
}
|
||||
i += Character.charCount(codePoint);
|
||||
}
|
||||
String prefix = "";
|
||||
if (charsetName.startsWith("UTF") && strBuf.startsWithQuotedText()) {
|
||||
switch (charSize) {
|
||||
case 1:
|
||||
prefix = "u8";
|
||||
break;
|
||||
case 2:
|
||||
prefix = "u";
|
||||
break;
|
||||
case 4:
|
||||
prefix = "U";
|
||||
break;
|
||||
}
|
||||
}
|
||||
return prefix + strBuf.toString();
|
||||
}
|
||||
|
||||
private byte[] getOriginalBytes(boolean isByteToStringCharEquiv, int charOffset, int codePoint,
|
||||
byte[] stringBytes, AdjustedCharsetInfo aci) {
|
||||
|
||||
if (isByteToStringCharEquiv) {
|
||||
byte[] originalCharBytes = new byte[charSize];
|
||||
System.arraycopy(stringBytes, charOffset * charSize + aci.byteStartOffset,
|
||||
originalCharBytes, 0, charSize);
|
||||
return originalCharBytes;
|
||||
}
|
||||
|
||||
// can't get original bytes, cheat and run the codePoint through the charset
|
||||
// to get what should be the same as the original bytes.
|
||||
String singleCharStr = new String(new int[] { codePoint }, 0, 1);
|
||||
Charset cs = Charset.isSupported(aci.charsetName) ? Charset.forName(aci.charsetName) : null;
|
||||
if (cs == null || !cs.canEncode()) {
|
||||
return null;
|
||||
}
|
||||
return singleCharStr.getBytes(cs);
|
||||
Charset cs = Charset.forName(adjustedCharsetName);
|
||||
renderer.decodeBytesUsingCharset(bb, cs, renderSetting,
|
||||
stringLayout.shouldTrimTrailingNulls());
|
||||
|
||||
String result = renderer.build();
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1024,17 +924,11 @@ public class StringDataInstance {
|
||||
StringDataInstance charseqSDI =
|
||||
new StringDataInstance(this, StringLayoutEnum.CHAR_SEQ, buf, length, newCSName);
|
||||
|
||||
return charseqSDI.getStringRep(StringRenderBuilder.SINGLE_QUOTE,
|
||||
StringRenderBuilder.DOUBLE_QUOTE);
|
||||
}
|
||||
char quoteChar = length == charSize
|
||||
? StringRenderBuilder.SINGLE_QUOTE
|
||||
: StringRenderBuilder.DOUBLE_QUOTE;
|
||||
|
||||
private boolean isReplacementCharAt(byte[] stringBytes, int byteOffset) {
|
||||
if (byteOffset + charSize > stringBytes.length) {
|
||||
return false;
|
||||
}
|
||||
long origCodePointValue = DataConverter.getInstance(buf.isBigEndian())
|
||||
.getValue(stringBytes, byteOffset, charSize);
|
||||
return origCodePointValue == StringUtilities.UNICODE_REPLACEMENT;
|
||||
return charseqSDI.getStringRep(quoteChar);
|
||||
}
|
||||
|
||||
private static String getTranslatedStringRepresentation(String translatedString) {
|
||||
|
@ -15,16 +15,20 @@
|
||||
*/
|
||||
package ghidra.program.model.data;
|
||||
|
||||
import ghidra.util.StringFormat;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.*;
|
||||
|
||||
import ghidra.program.model.data.RenderUnicodeSettingsDefinition.RENDER_ENUM;
|
||||
import ghidra.util.StringUtilities;
|
||||
|
||||
/**
|
||||
* Helper class used to build up a formatted (for human consumption) string representation returned
|
||||
* by Unicode and String data types.
|
||||
* <p>
|
||||
* Call {@link #toString()} to retrieve the formatted string.
|
||||
* Call {@link #build()} to retrieve the formatted string.
|
||||
* <p>
|
||||
* Example (quotes are part of result): {@code "Test\tstring",01,02,"Second\npart",00}
|
||||
* Example (quotes are part of result): {@code "Test\tstring",01h,02h,"Second\npart"}
|
||||
*
|
||||
*/
|
||||
public class StringRenderBuilder {
|
||||
@ -32,111 +36,29 @@ public class StringRenderBuilder {
|
||||
public static final char SINGLE_QUOTE = '\'';
|
||||
private static final int MAX_ASCII = 0x80;
|
||||
|
||||
// Accumulates the formatted output.
private final StringBuilder sb = new StringBuilder();
// Size of a character, in bytes; selects the string-literal prefix used by build().
private final int charSize;
// True when the string's charset is a UTF charset, which makes build() add a prefix.
private final boolean utfCharset;
// Character used to open and close quoted text sections.
private final char quoteChar;
// True while the output is in a byte-value section rather than a quoted text section.
private boolean byteMode = true;

/**
 * Creates a builder that quotes text sections with {@link #DOUBLE_QUOTE}.
 *
 * @param utfCharset true if the string being rendered uses a UTF charset
 * @param charSize size of a character, in bytes
 */
public StringRenderBuilder(boolean utfCharset, int charSize) {
    this(utfCharset, charSize, DOUBLE_QUOTE);
}

/**
 * Creates a builder.
 *
 * @param utfCharset true if the string being rendered uses a UTF charset
 * @param charSize size of a character, in bytes
 * @param quoteChar character used to quote text sections
 */
public StringRenderBuilder(boolean utfCharset, int charSize, char quoteChar) {
    this.charSize = charSize;
    this.utfCharset = utfCharset;
    this.quoteChar = quoteChar;
}
|
||||
|
||||
/**
|
||||
* Returns true if the current formatted string starts with a quoted text section,
|
||||
* instead of a byte value section. Useful to indicate if
|
||||
* the string could have a prefix applied to it (ie. u8"text")
|
||||
* <p>
|
||||
*
|
||||
* @return boolean true if this string will start with a quoted text section
|
||||
*/
|
||||
public boolean startsWithQuotedText() {
|
||||
return sb.length() > 0 && sb.charAt(0) == quoteChar;
|
||||
}
|
||||
|
||||
/**
|
||||
* Append the characters in the specified string. The added characters will
|
||||
* be shown in a quoted text region.
|
||||
*
|
||||
* @param str String to add
|
||||
*/
|
||||
public void addString(String str) {
|
||||
ensureTextMode();
|
||||
sb.append(str);
|
||||
}
|
||||
|
||||
/**
|
||||
* Append the specified char after an escaping backslash "\", ie
|
||||
* {@literal "x" -> "\x";}
|
||||
*
|
||||
* @param ch
|
||||
*/
|
||||
public void addEscapedChar(char ch) {
|
||||
ensureTextMode();
|
||||
sb.append("\\").append(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a single character. It will be shown in a quoted text region.
|
||||
*
|
||||
* @param codePoint Character to add
|
||||
*/
|
||||
public void addCodePointChar(int codePoint) {
|
||||
ensureTextMode();
|
||||
if (codePoint == quoteChar) {
|
||||
sb.append("\\");
|
||||
}
|
||||
sb.appendCodePoint(codePoint);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a single character that needs to be shown as a numeric hex value.
|
||||
*
|
||||
* @param codePoint Character to add
|
||||
*/
|
||||
public void addCodePointValue(int codePoint) {
|
||||
ensureByteMode();
|
||||
String valStr = Integer.toHexString(codePoint).toUpperCase();
|
||||
valStr = (valStr.length() < charSize * 2)
|
||||
? StringFormat.padIt(valStr, charSize * 2, (char) 0, true)
|
||||
: valStr;
|
||||
sb.append(valStr);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add byte values, shown as numeric hex values.
|
||||
* <p>
|
||||
* {@literal { 0, 1, 2 } -> 00,01,02}
|
||||
*
|
||||
* @param bytes to convert to hex and append. If null, append "???"
|
||||
*/
|
||||
public void addByteSeq(byte[] bytes) {
|
||||
if (bytes == null) {
|
||||
ensureByteMode();
|
||||
sb.append("???");
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < bytes.length; i++) {
|
||||
ensureByteMode();
|
||||
String valStr = Integer.toHexString(bytes[i] & 0xff).toUpperCase();
|
||||
if (valStr.length() < 2) {
|
||||
sb.append("0");
|
||||
}
|
||||
sb.append(valStr).append("h");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an unicode codepoint as its escaped hex value, with a escape character
|
||||
* Add a unicode codepoint as its escaped hex value, with an escape character
|
||||
* prefix of 'x', 'u' or 'U' depending on the magnitude of the codePoint value.
|
||||
* <p>
|
||||
* {@literal codePoint 15 -> '\' 'x' "0F"}<br>
|
||||
* {@literal codePoint 65535 -> '\' 'u' "FFFF"}<br>
|
||||
* {@literal codePoint 65536 -> '\' 'U' "10000"}<br>
|
||||
* {@literal codePoint 65536 -> '\' 'U' "00010000"}<br>
|
||||
*
|
||||
* @param codePoint int value
|
||||
*/
|
||||
@ -151,18 +73,117 @@ public class StringRenderBuilder {
|
||||
}
|
||||
|
||||
/**
|
||||
* Example (quotes are part of result): {@code "Test\tstring",01,02,"Second\npart",00}
|
||||
* Adds the characters found in the supplied {@link ByteBuffer} to the result.
|
||||
* <p>
|
||||
* @return Formatted string
|
||||
* Any portions of the byte buffer that cause problems for the charset codec will be added
|
||||
* as a {@link #addByteSeq(ByteBuffer, int) byte sequence}.
|
||||
* <p>
|
||||
* Characters that are outside the traditional ASCII range will be rendered as-is or as
|
||||
* escape sequences, depending on the RENDER_ENUM setting.
|
||||
*
|
||||
* @param bb {@link ByteBuffer} containing bytes of a string
|
||||
* @param cs {@link Charset} that should be used to decode the bytes
|
||||
* @param renderSetting {@link RENDER_ENUM}
|
||||
* @param trimTrailingNulls boolean flag, if true trailing null bytes will not be included
|
||||
* in the rendered output
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
String str = sb.toString();
|
||||
if (!byteMode) {
|
||||
// close the quoted text mode in the local string
|
||||
str = str + quoteChar;
|
||||
public void decodeBytesUsingCharset(ByteBuffer bb, Charset cs, RENDER_ENUM renderSetting,
|
||||
boolean trimTrailingNulls) {
|
||||
CharsetDecoder codec = cs.newDecoder()
|
||||
.onMalformedInput(CodingErrorAction.REPORT)
|
||||
.onUnmappableCharacter(CodingErrorAction.REPORT);
|
||||
CharBuffer cb = CharBuffer.allocate(Math.min(10, bb.remaining()));
|
||||
while (bb.hasRemaining()) {
|
||||
CoderResult cr = codec.decode(bb, cb, true);
|
||||
if (!bb.hasRemaining() && trimTrailingNulls) {
|
||||
// if this is the last chunk of text, trim nulls if necessary
|
||||
// TODO: this conditional is a bit fragile and could fail to trigger if a long
|
||||
// run of trailing nulls was split over multiple charbuffers. This shouldn't
|
||||
// happen because of the allocated size of the charbuffer is tied to the size of the
|
||||
// input bytebuffer
|
||||
trimTrailingNulls(cb);
|
||||
}
|
||||
flushStringModeCharBuf(cb, renderSetting);
|
||||
if ( cr.isError() ) {
|
||||
addByteSeq(bb, cr.length());
|
||||
}
|
||||
else if (cr.isUnderflow()) {
|
||||
// there was a trailing byte sequence that the charset needs more bytes to
|
||||
// finish. Since we gave the charset all the bytes, any remaining will have to
|
||||
// be rendered as bytes values.
|
||||
// This can also trigger for successful end-of-input, remaining == 0
|
||||
addByteSeq(bb, bb.remaining());
|
||||
}
|
||||
}
|
||||
return str;
|
||||
|
||||
CoderResult flushResult = codec.flush(cb);
|
||||
if (!flushResult.isUnderflow()) {
|
||||
// error, should not happen
|
||||
}
|
||||
flushStringModeCharBuf(cb, renderSetting);
|
||||
}
|
||||
|
||||
private void addString(String str) {
|
||||
ensureTextMode();
|
||||
sb.append(str);
|
||||
}
|
||||
|
||||
private void addCodePointChar(int codePoint) {
|
||||
ensureTextMode();
|
||||
if (codePoint == quoteChar) {
|
||||
sb.append("\\");
|
||||
}
|
||||
sb.appendCodePoint(codePoint);
|
||||
}
|
||||
|
||||
private void addByteSeq(ByteBuffer bytes, int count) {
|
||||
for (int i = 0; i < count; i++) {
|
||||
ensureByteMode();
|
||||
sb.append("%02Xh".formatted(bytes.get()));
|
||||
}
|
||||
}
|
||||
|
||||
private void trimTrailingNulls(CharBuffer cb) {
|
||||
while (cb.position() > 0 && cb.get(cb.position() - 1) == 0) {
|
||||
cb.position(cb.position() - 1);
|
||||
}
|
||||
}
|
||||
|
||||
private void flushStringModeCharBuf(CharBuffer cb, RENDER_ENUM renderSetting) {
|
||||
cb.flip();
|
||||
renderChars(cb, renderSetting);
|
||||
cb.clear();
|
||||
}
|
||||
|
||||
private void renderChars(CharSequence stringValue, RENDER_ENUM renderSetting) {
|
||||
for (int i = 0, strLength = stringValue.length(); i < strLength;) {
|
||||
int codePoint = Character.codePointAt(stringValue, i);
|
||||
|
||||
if (StringUtilities.isDisplayable(codePoint)) {
|
||||
addCodePointChar(codePoint);
|
||||
}
|
||||
else if (codePoint == 0) {
|
||||
// TODO: there is an opportunity to make this smarter by not switching from
|
||||
// byte mode to string mode to add nulls.
|
||||
addString("\\0");
|
||||
}
|
||||
else if (StringUtilities.isControlCharacterOrBackslash(codePoint)) {
|
||||
addString(StringUtilities.convertCodePointToEscapeSequence(codePoint));
|
||||
}
|
||||
else if (Character.isISOControl(codePoint) || !Character.isDefined(codePoint) ||
|
||||
codePoint == StringUtilities.UNICODE_BE_BYTE_ORDER_MARK) {
|
||||
addEscapedCodePoint(codePoint);
|
||||
}
|
||||
else if (renderSetting == RENDER_ENUM.ALL) {
|
||||
addCodePointChar(codePoint);
|
||||
}
|
||||
else {
|
||||
addEscapedCodePoint(codePoint);
|
||||
}
|
||||
|
||||
i += Character.charCount(codePoint);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void ensureTextMode() {
|
||||
@ -186,4 +207,34 @@ public class StringRenderBuilder {
|
||||
byteMode = true;
|
||||
}
|
||||
|
||||
public String build() {
|
||||
// TODO: change the string prefix modifier to align with what decompiler does
|
||||
String s = !sb.isEmpty() ? toString() : "%c%c".formatted(quoteChar, quoteChar); // '' won't make sense
|
||||
String prefix = "";
|
||||
if (utfCharset && !s.isEmpty() && s.charAt(0) == quoteChar) {
|
||||
prefix = switch (charSize) {
|
||||
case 1 -> "u8";
|
||||
case 2 -> "u";
|
||||
case 4 -> "U";
|
||||
default -> "";
|
||||
};
|
||||
}
|
||||
return prefix + s;
|
||||
}
|
||||
|
||||
/**
|
||||
* Example (quotes are part of result): {@code "Test\tstring",01,02,"Second\npart",00}
|
||||
* <p>
|
||||
* @return Formatted string
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
String str = sb.toString();
|
||||
if (!byteMode) {
|
||||
// close the quoted text mode in the local string
|
||||
str += quoteChar;
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,229 @@
|
||||
/* ###
|
||||
* IP: GHIDRA
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package ghidra.program.model.data;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import generic.test.AbstractGTest;
|
||||
import ghidra.program.model.data.RenderUnicodeSettingsDefinition.RENDER_ENUM;
|
||||
|
||||
public class StringRenderBuilderTest extends AbstractGTest {
|
||||
|
||||
private ByteBuffer bb(int... values) {
|
||||
return ByteBuffer.wrap(bytes(values));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEmptyString() {
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
String emptyString = srb.build();
|
||||
assertEquals("\"\"", emptyString);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEmptyWChar2String() {
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(true, 2, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
String emptyString = srb.build();
|
||||
assertEquals("u\"\"", emptyString);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEmptyWChar4String() {
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(true, 4, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
String emptyString = srb.build();
|
||||
assertEquals("U\"\"", emptyString);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEmptyStringWithNulls() {
|
||||
ByteBuffer bb = bb(0, 0, 0);
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
|
||||
String emptyString = srb.build();
|
||||
assertEquals("\"\"", emptyString);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEmptyStringWithNullsNoTrim() {
|
||||
ByteBuffer bb = bb(0, 0, 0);
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, false);
|
||||
String s = srb.build();
|
||||
assertEquals("\"\\0\\0\\0\"", s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInteriorNulls() {
|
||||
ByteBuffer bb = bb('t', 'e', 0, 's', 't', 0);
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
|
||||
String s = srb.build();
|
||||
assertEquals("\"te\\0st\"", s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimpleString() {
|
||||
ByteBuffer bb = bb('t', 'e', 's', 't');
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
|
||||
String s = srb.build();
|
||||
assertEquals("\"test\"", s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStandardEscapedChars() {
|
||||
ByteBuffer bb = bb('t', 'e', 's', 't', '\n', '\t', '\r');
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
|
||||
String s = srb.build();
|
||||
assertEquals("\"test\\n\\t\\r\"", s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testQuotedQuotesChars() {
|
||||
ByteBuffer bb = bb('t', 'e', 's', 't', '"', '1', '2', '3');
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
|
||||
String s = srb.build();
|
||||
assertEquals("\"test\\\"123\"", s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSingleQuoteChars() {
|
||||
ByteBuffer bb = bb('t', 'e', 's', 't', '\'', '1', '2', '3');
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
|
||||
String s = srb.build();
|
||||
assertEquals("\"test'123\"", s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimpleStringWithTrailingNulls() {
|
||||
ByteBuffer bb = bb('t', 'e', 's', 't', 0, 0, 0);
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
|
||||
String s = srb.build();
|
||||
assertEquals("\"test\"", s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimpleStringWithTrailingNullsNoTrim() {
|
||||
ByteBuffer bb = bb('t', 'e', 's', 't', 0, 0, 0);
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, false);
|
||||
String s = srb.build();
|
||||
assertEquals("\"test\\0\\0\\0\"", s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUtf8String() {
|
||||
ByteBuffer bb = bb(0xE1, 0x84, 0xA2); // should decode to \u1122
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(true, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.UTF_8, RENDER_ENUM.ALL, true);
|
||||
String s = srb.build();
|
||||
assertEquals("u8\"\u1122\"", s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUtf8NoRenderNonLatinString() {
|
||||
ByteBuffer bb = bb(0xE1, 0x84, 0xA2); // should decode to \u1122
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(true, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.UTF_8, RENDER_ENUM.ESC_SEQ, true);
|
||||
String s = srb.build();
|
||||
assertEquals("u8\"\\u1122\"", s); // <- result is \ u 1122
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBadBytes_USASCII() {
|
||||
ByteBuffer bb = bb('t', 'e', 's', 't', 0x80);
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
|
||||
String s = srb.build();
|
||||
assertEquals("\"test\",80h", s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBadBytes_USASCII2() {
|
||||
// bad bytes in interior of string, switching modes
|
||||
ByteBuffer bb = bb('t', 'e', 0x80, 's', 't');
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
|
||||
String s = srb.build();
|
||||
assertEquals("\"te\",80h,\"st\"", s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBadBytes_USASCII3() {
|
||||
// bad bytes at beginning of string
|
||||
ByteBuffer bb = bb(0x80, 't', 'e', 's', 't');
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
|
||||
String s = srb.build();
|
||||
assertEquals("80h,\"test\"", s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTruncatedUtf8() {
|
||||
ByteBuffer bb = bb('t', 'e', 's', 't', 0xE1, 0x84);
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(true, 1, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.UTF_8, RENDER_ENUM.ALL, true);
|
||||
String s = srb.build();
|
||||
assertEquals("u8\"test\",E1h,84h", s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUtf16() {
|
||||
ByteBuffer bb = bb('t', 0, 'e', 0, 's', 0, 't', 0);
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(true, 2, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.UTF_16LE, RENDER_ENUM.ALL, true);
|
||||
String s = srb.build();
|
||||
assertEquals("u\"test\"", s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUtf16BOM_LE() {
|
||||
ByteBuffer bb = bb(0xff, 0xfe, 't', 0, 'e', 0, 's', 0, 't', 0);
|
||||
StringRenderBuilder srb =
|
||||
new StringRenderBuilder(true, 2, StringRenderBuilder.DOUBLE_QUOTE);
|
||||
srb.decodeBytesUsingCharset(bb, StandardCharsets.UTF_16LE, RENDER_ENUM.ALL, true);
|
||||
String s = srb.build();
|
||||
assertEquals("u\"\\uFEFFtest\"", s);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user