Merge remote-tracking branch 'origin/GP-4160_dev747368_charset_logic--SQUASHED'

This commit is contained in:
Ryan Kurtz 2023-12-27 07:39:45 -05:00
commit 9226e7e090
3 changed files with 472 additions and 298 deletions

View File

@ -45,8 +45,6 @@ import ghidra.util.*;
*/
public class StringDataInstance {
private static final int ASCII_MAX = 0x7f;
/**
* Returns true if the {@link Data} instance is a 'string'.
*
@ -61,8 +59,8 @@ public class StringDataInstance {
if (dt instanceof AbstractStringDataType) {
return true;
}
if (dt instanceof Array) {
ArrayStringable as = ArrayStringable.getArrayStringable(((Array) dt).getDataType());
if (dt instanceof Array arrayDt) {
ArrayStringable as = ArrayStringable.getArrayStringable(arrayDt.getDataType());
return (as != null) && as.hasStringValue(data);
}
return false;
@ -79,11 +77,11 @@ public class StringDataInstance {
* @return boolean true if data type is or could be a string
*/
public static boolean isStringDataType(DataType dt) {
if (dt instanceof TypeDef) {
dt = ((TypeDef) dt).getBaseDataType();
if (dt instanceof TypeDef td) {
dt = td.getBaseDataType();
}
return dt instanceof AbstractStringDataType || (dt instanceof Array &&
ArrayStringable.getArrayStringable(((Array) dt).getDataType()) != null);
return dt instanceof AbstractStringDataType || (dt instanceof Array arrayDt &&
ArrayStringable.getArrayStringable(arrayDt.getDataType()) != null);
}
/**
@ -161,13 +159,12 @@ public class StringDataInstance {
return NULL_INSTANCE;
}
DataType dt = data.getBaseDataType();
if (dt instanceof AbstractStringDataType) {
return ((AbstractStringDataType) dt).getStringDataInstance(data, data,
data.getLength());
if (dt instanceof AbstractStringDataType asdt) {
return asdt.getStringDataInstance(data, data, data.getLength());
}
if (dt instanceof Array && data.isInitializedMemory()) {
if (dt instanceof Array arrayDt && data.isInitializedMemory()) {
ArrayStringable arrayStringable =
ArrayStringable.getArrayStringable(((Array) dt).getDataType());
ArrayStringable.getArrayStringable(arrayDt.getDataType());
if (arrayStringable != null && arrayStringable.hasStringValue(data)) {
return new StringDataInstance(arrayStringable, data, data, data.getLength(), true);
}
@ -188,15 +185,15 @@ public class StringDataInstance {
*/
public static StringDataInstance getStringDataInstance(DataType dataType, MemBuffer buf,
Settings settings, int length) {
if (dataType instanceof AbstractStringDataType) {
return ((AbstractStringDataType) dataType).getStringDataInstance(buf, settings, length);
if (dataType instanceof AbstractStringDataType asdt) {
return asdt.getStringDataInstance(buf, settings, length);
}
boolean isArray = dataType instanceof Array;
if (isArray) {
dataType = ArrayStringable.getArrayStringable(((Array) dataType).getDataType());
}
if (dataType instanceof ArrayStringable &&
((ArrayStringable) dataType).hasStringValue(settings) && buf.isInitializedMemory()) {
if (dataType instanceof ArrayStringable arrayStringable &&
arrayStringable.hasStringValue(settings) && buf.isInitializedMemory()) {
// this could be either a charsequence or an array of char elements
return new StringDataInstance(dataType, settings, buf, length, isArray);
@ -250,11 +247,6 @@ public class StringDataInstance {
public static final String UNKNOWN = "??";
public static final String UNKNOWN_DOT_DOT_DOT = "??...";
/**
* A string with a single char that is the Byte-Order-Mark character.
*/
private static final String BOM_RESULT_STR = "\ufeff";
static final int SIZEOF_PASCAL255_STR_LEN_FIELD = 1;
static final int SIZEOF_PASCAL64k_STR_LEN_FIELD = 2;
@ -339,8 +331,7 @@ public class StringDataInstance {
private static String getTranslatedValue(Settings settings, MemBuffer buf) {
// Translation only exists for defined Data which corresponds to settings.
if (settings instanceof Data) {
Data data = (Data) settings;
if (settings instanceof Data data) {
if (data.isDefined()) {
return TRANSLATION.getTranslatedValue(data);
}
@ -374,8 +365,8 @@ public class StringDataInstance {
}
private static StringLayoutEnum getLayoutFromDataType(DataType dataType) {
if (dataType instanceof AbstractStringDataType) {
return ((AbstractStringDataType) dataType).getStringLayout();
if (dataType instanceof AbstractStringDataType asdt) {
return asdt.getStringLayout();
}
if (dataType instanceof AbstractIntegerDataType || dataType instanceof BitFieldDataType) {
return StringLayoutEnum.CHAR_SEQ;
@ -384,11 +375,11 @@ public class StringDataInstance {
}
static String getCharsetNameFromDataTypeOrSettings(DataType dataType, Settings settings) {
if (dataType instanceof BitFieldDataType) {
dataType = ((BitFieldDataType) dataType).getBaseDataType();
if (dataType instanceof BitFieldDataType bfdt) {
dataType = bfdt.getBaseDataType();
}
return (dataType instanceof DataTypeWithCharset)
? ((DataTypeWithCharset) dataType).getCharsetName(settings)
return (dataType instanceof DataTypeWithCharset dtwcs)
? dtwcs.getCharsetName(settings)
: DEFAULT_CHARSET_NAME;
}
@ -586,12 +577,16 @@ public class StringDataInstance {
}
byte[] stringBytes = convertPaddedToUnpadded(getStringBytes());
if (stringBytes == null) {
return StringDataInstance.UNKNOWN_DOT_DOT_DOT;
return UNKNOWN_DOT_DOT_DOT;
}
AdjustedCharsetInfo aci = getAdjustedCharsetInfo(stringBytes);
String str = convertBytesToString(stringBytes, aci);
ByteBuffer bb = ByteBuffer.wrap(stringBytes);
String adjustedCharsetName = getAdjustedCharsetInfo(bb); // force BE or LE variants of UTF charsets
return str;
if (!Charset.isSupported(adjustedCharsetName)) {
return UNKNOWN_DOT_DOT_DOT;
}
Charset cs = Charset.forName(adjustedCharsetName);
return new String(stringBytes, cs);
}
private byte[] getStringBytes() {
@ -709,13 +704,6 @@ public class StringDataInstance {
return buf.isBigEndian() ? Endian.BIG : Endian.LITTLE;
}
private String convertBytesToString(byte[] bytes, AdjustedCharsetInfo aci) {
Charset cs = Charset.isSupported(aci.charsetName) ? Charset.forName(aci.charsetName) : null;
return (cs != null)
? new String(bytes, aci.byteStartOffset, bytes.length - aci.byteStartOffset, cs)
: convertBytesToStringCustomCharset(bytes, aci);
}
private AdjustedCharsetInfo getAdjustedCharsetInfo() {
if (length == -1 && getStringLength() == -1) {
return getAdjustedCharsetInfo(new byte[] {});
@ -751,30 +739,21 @@ public class StringDataInstance {
return result;
}
private static DataConverter getDataConverter(Endian endian) {
return endian == Endian.BIG ? BigEndianDataConverter.INSTANCE
: LittleEndianDataConverter.INSTANCE;
}
/*
* Converts a byte array to String based on a custom Ghidra charset name.
*/
private static String convertBytesToStringCustomCharset(byte[] bytes, AdjustedCharsetInfo aci) {
switch (aci.charsetName) {
case "UTF-32LE":
case "UTF-32BE":
// fall-back because real jvm supplied UTF-32 Charset isn't guaranteed to be present
DataConverter dc = getDataConverter(aci.endian);
int[] codePoints = new int[(bytes.length - aci.byteStartOffset) / 4];
for (int i = 0; i < codePoints.length; i++) {
codePoints[i] = dc.getInt(bytes, aci.byteStartOffset + (i * 4));
if (codePoints[i] < 0 || codePoints[i] > Character.MAX_CODE_POINT) {
codePoints[i] = StringUtilities.UNICODE_REPLACEMENT;
}
}
return new String(codePoints, 0, codePoints.length);
/**
 * Returns the name of the charset that should be used to decode the string bytes.
 * <p>
 * When {@code charsetName} is a BOM-style charset (per {@code CharsetInfo.isBOMCharset}),
 * an endianness suffix is appended to the name. The endianness is taken, in order of
 * preference, from a byte-order-mark found in the bytes, from {@code endianSetting}, and
 * finally from {@code getMemoryEndianness()}.
 *
 * @param bb {@link ByteBuffer} wrapping the bytes of the string
 * @return charset name, possibly with an endianness suffix appended
 */
private String getAdjustedCharsetInfo(ByteBuffer bb) {
String result = charsetName;
if (CharsetInfo.isBOMCharset(charsetName)) {
// first choice: a byte-order-mark at the start of the string bytes
Endian endian = getEndiannessFromBOM(bb, charSize);
if (endian == null) {
// second choice: the explicit endian setting (may itself be null)
endian = endianSetting;
}
if (endian == null) {
// last resort: endianness of the underlying memory
endian = getMemoryEndianness();
}
// add "LE" or "BE" to end of charset's name depending
// on the discovered endianness of the string
result += endian.toShortString();
}
// NOTE(review): the next line looks like a removed line left over from the diff view;
// the following 'return result' is the intended return -- confirm against the real file.
return null;
return result;
}
private static Endian getEndiannessFromBOM(byte[] bytes, int charSize) {
@ -792,6 +771,25 @@ public class StringDataInstance {
return null;
}
/**
 * Inspects the first character-sized unit at the buffer's current position and reports
 * the endianness implied by a unicode byte-order-mark, if one is present.
 * <p>
 * The buffer's position is not modified.
 *
 * @param bb {@link ByteBuffer} containing the bytes of the string
 * @param charSize number of bytes in a single character unit
 * @return {@link Endian#BIG} or {@link Endian#LITTLE} if a BOM was recognized, otherwise
 * null (including when fewer than {@code charSize} bytes remain)
 */
private static Endian getEndiannessFromBOM(ByteBuffer bb, int charSize) {
    if (bb.remaining() < charSize) {
        return null;
    }

    // Read relative to the current position (without advancing it) so that the read agrees
    // with the remaining() check above even when the buffer's position is not 0.
    byte[] bomBytes = new byte[charSize];
    bb.get(bb.position(), bomBytes);

    // Always interpret the bytes as big-endian; a little-endian BOM then shows up as the
    // byte-swapped LE16 / LE32 constant.
    int beValue = (int) BigEndianDataConverter.INSTANCE.getValue(bomBytes, charSize);
    switch (beValue) {
        case StringUtilities.UNICODE_BE_BYTE_ORDER_MARK:
            return Endian.BIG;
        case StringUtilities.UNICODE_LE16_BYTE_ORDER_MARK:
        case StringUtilities.UNICODE_LE32_BYTE_ORDER_MARK:
            return Endian.LITTLE;
        default:
            return null;
    }
}
/**
* Returns a formatted version of the string returned by {@link #getStringValue()}.
* <p>
@ -806,8 +804,8 @@ public class StringDataInstance {
*/
public String getStringRepresentation() {
return showTranslation && translatedValue != null
? getTranslatedStringRepresentation(translatedValue)
: getStringRep(StringRenderBuilder.DOUBLE_QUOTE, StringRenderBuilder.DOUBLE_QUOTE);
? getTranslatedStringRepresentation(translatedValue)
: getStringRep(StringRenderBuilder.DOUBLE_QUOTE);
}
/**
@ -824,13 +822,15 @@ public class StringDataInstance {
* @return formatted String
*/
public String getStringRepresentation(boolean originalOrTranslated) {
if (!originalOrTranslated && translatedValue == null) {
return UNKNOWN;
}
return originalOrTranslated
? getStringRep(StringRenderBuilder.DOUBLE_QUOTE, StringRenderBuilder.DOUBLE_QUOTE)
: translatedValue != null ? getTranslatedStringRepresentation(translatedValue)
: UNKNOWN;
? getStringRep(StringRenderBuilder.DOUBLE_QUOTE)
: getTranslatedStringRepresentation(translatedValue);
}
private String getStringRep(char quoteChar, char quoteCharMulti) {
private String getStringRep(char quoteChar) {
if (isProbe() || isBadCharSize() || !buf.isInitializedMemory()) {
return UNKNOWN;
@ -840,122 +840,22 @@ public class StringDataInstance {
if (stringBytes == null) {
return UNKNOWN_DOT_DOT_DOT;
}
AdjustedCharsetInfo aci = getAdjustedCharsetInfo(stringBytes);
String stringValue = convertBytesToString(stringBytes, aci);
if (stringValue == null) {
ByteBuffer bb = ByteBuffer.wrap(stringBytes);
String adjustedCharsetName = getAdjustedCharsetInfo(bb); // force BE or LE variants of UTF charsets
StringRenderBuilder renderer =
new StringRenderBuilder(adjustedCharsetName.startsWith("UTF"), charSize, quoteChar);
if (!Charset.isSupported(adjustedCharsetName)) {
return UNKNOWN_DOT_DOT_DOT;
}
if (stringValue.length() == 0 && aci.byteStartOffset != 0) {
// If the byteStartOffset isn't zero it means there was one char that was the unicode BOM.
// Asking the Charset to decode it returned nothing, so force it.
stringValue = BOM_RESULT_STR;
}
// if we get the same number of characters out that we put into the decoder,
// then its a good chance there is a one-to-one correspondence between original char
// offsets and decoded char offsets.
boolean isByteToStringCharEquiv =
stringValue.length() == ((stringBytes.length - aci.byteStartOffset) / charSize);
stringValue = stringLayout.shouldTrimTrailingNulls() ? trimNulls(stringValue) : stringValue;
StringRenderBuilder strBuf = new StringRenderBuilder(charSize,
stringValue.length() == 1 ? quoteChar : quoteCharMulti);
if (stringValue.isEmpty() || (stringValue.length() == 1 && stringValue.charAt(0) == 0)) {
// force the string renderer into "string" mode so we get empty quotes when done.
strBuf.addString("");
}
// For each 32bit character in the java string try to add it to the StringRenderBuilder
for (int i = 0, strLength = stringValue.length(); i < strLength;) {
int codePoint = stringValue.codePointAt(i);
RENDER_ENUM currentCharRenderSetting = renderSetting;
if (codePoint == StringUtilities.UNICODE_REPLACEMENT && isByteToStringCharEquiv &&
!isReplacementCharAt(stringBytes, i * charSize + aci.byteStartOffset)) {
// if this is a true decode error and we can recover the original bytes,
// then force the render mode to byte seq.
currentCharRenderSetting = RENDER_ENUM.BYTE_SEQ;
}
if (StringUtilities.isControlCharacterOrBackslash(codePoint)) {
strBuf.addString(StringUtilities.convertCodePointToEscapeSequence(codePoint));
}
else if (codePoint == 0x0000 && renderSetting != RENDER_ENUM.BYTE_SEQ) {
strBuf.addEscapedChar('0');
}
else if (StringUtilities.isDisplayable(codePoint)) {
strBuf.addCodePointChar(codePoint);
}
else {
// not simple ascii, decide how to handle:
// add the character to the string in a format depending on the
// render settings. ISO control chars are forced to be
// escaped regardless of the render setting.
if (currentCharRenderSetting == RENDER_ENUM.ALL) {
if (codePoint <= ASCII_MAX) {
// render non-displayable, non-control-char ascii-ish bytes as bytes instead
// of as escape sequences
currentCharRenderSetting = RENDER_ENUM.BYTE_SEQ;
}
else if (Character.isISOControl(codePoint) || !Character.isDefined(codePoint) ||
codePoint == StringUtilities.UNICODE_BE_BYTE_ORDER_MARK) {
currentCharRenderSetting = RENDER_ENUM.ESC_SEQ;
}
}
switch (currentCharRenderSetting) {
case ALL:
strBuf.addCodePointChar(codePoint);
break;
case BYTE_SEQ:
strBuf.addByteSeq(getOriginalBytes(isByteToStringCharEquiv, i, codePoint,
stringBytes, aci));
break;
case ESC_SEQ:
strBuf.addEscapedCodePoint(codePoint);
break;
}
}
i += Character.charCount(codePoint);
}
String prefix = "";
if (charsetName.startsWith("UTF") && strBuf.startsWithQuotedText()) {
switch (charSize) {
case 1:
prefix = "u8";
break;
case 2:
prefix = "u";
break;
case 4:
prefix = "U";
break;
}
}
return prefix + strBuf.toString();
}
private byte[] getOriginalBytes(boolean isByteToStringCharEquiv, int charOffset, int codePoint,
byte[] stringBytes, AdjustedCharsetInfo aci) {
if (isByteToStringCharEquiv) {
byte[] originalCharBytes = new byte[charSize];
System.arraycopy(stringBytes, charOffset * charSize + aci.byteStartOffset,
originalCharBytes, 0, charSize);
return originalCharBytes;
}
// can't get original bytes, cheat and run the codePoint through the charset
// to get what should be the same as the original bytes.
String singleCharStr = new String(new int[] { codePoint }, 0, 1);
Charset cs = Charset.isSupported(aci.charsetName) ? Charset.forName(aci.charsetName) : null;
if (cs == null || !cs.canEncode()) {
return null;
}
return singleCharStr.getBytes(cs);
Charset cs = Charset.forName(adjustedCharsetName);
renderer.decodeBytesUsingCharset(bb, cs, renderSetting,
stringLayout.shouldTrimTrailingNulls());
String result = renderer.build();
return result;
}
/**
@ -1024,17 +924,11 @@ public class StringDataInstance {
StringDataInstance charseqSDI =
new StringDataInstance(this, StringLayoutEnum.CHAR_SEQ, buf, length, newCSName);
return charseqSDI.getStringRep(StringRenderBuilder.SINGLE_QUOTE,
StringRenderBuilder.DOUBLE_QUOTE);
}
char quoteChar = length == charSize
? StringRenderBuilder.SINGLE_QUOTE
: StringRenderBuilder.DOUBLE_QUOTE;
private boolean isReplacementCharAt(byte[] stringBytes, int byteOffset) {
if (byteOffset + charSize > stringBytes.length) {
return false;
}
long origCodePointValue = DataConverter.getInstance(buf.isBigEndian())
.getValue(stringBytes, byteOffset, charSize);
return origCodePointValue == StringUtilities.UNICODE_REPLACEMENT;
return charseqSDI.getStringRep(quoteChar);
}
private static String getTranslatedStringRepresentation(String translatedString) {

View File

@ -15,16 +15,20 @@
*/
package ghidra.program.model.data;
import ghidra.util.StringFormat;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.*;
import ghidra.program.model.data.RenderUnicodeSettingsDefinition.RENDER_ENUM;
import ghidra.util.StringUtilities;
/**
* Helper class used to build up a formatted (for human consumption) string representation returned
* by Unicode and String data types.
* <p>
* Call {@link #toString()} to retrieve the formatted string.
* Call {@link #build()} to retrieve the formatted string.
* <p>
* Example (quotes are part of result): {@code "Test\tstring",01,02,"Second\npart",00}
* Example (quotes are part of result): {@code "Test\tstring",01h,02h,"Second\npart"}
*
*/
public class StringRenderBuilder {
@ -32,111 +36,29 @@ public class StringRenderBuilder {
public static final char SINGLE_QUOTE = '\'';
private static final int MAX_ASCII = 0x80;
private StringBuilder sb = new StringBuilder();
private boolean byteMode = true;
private final char quoteChar;
private final StringBuilder sb = new StringBuilder();
private final int charSize;
private final boolean utfCharset;
private final char quoteChar;
private boolean byteMode = true;
public StringRenderBuilder(int charSize) {
this(charSize, DOUBLE_QUOTE);
public StringRenderBuilder(boolean utfCharset, int charSize) {
this(utfCharset, charSize, DOUBLE_QUOTE);
}
public StringRenderBuilder(int charSize, char quoteChar) {
public StringRenderBuilder(boolean utfCharset, int charSize, char quoteChar) {
this.charSize = charSize;
this.utfCharset = utfCharset;
this.quoteChar = quoteChar;
}
/**
* Returns true if the current formatted string starts with a quoted text section,
* instead of a byte value section. Useful to indicate if
* the string could have a prefix applied to it (ie. u8"text")
* <p>
*
* @return boolean true if this string will start with a quoted text section
*/
public boolean startsWithQuotedText() {
return sb.length() > 0 && sb.charAt(0) == quoteChar;
}
/**
* Append the characters in the specified string. The added characters will
* be shown in a quoted text region.
*
* @param str String to add
*/
public void addString(String str) {
ensureTextMode();
sb.append(str);
}
/**
* Append the specified char after an escaping backslash "\", ie
* {@literal "x" -> "\x";}
*
* @param ch
*/
public void addEscapedChar(char ch) {
ensureTextMode();
sb.append("\\").append(ch);
}
/**
* Add a single character. It will be shown in a quoted text region.
*
* @param codePoint Character to add
*/
public void addCodePointChar(int codePoint) {
ensureTextMode();
if (codePoint == quoteChar) {
sb.append("\\");
}
sb.appendCodePoint(codePoint);
}
/**
* Add a single character that needs to be shown as a numeric hex value.
*
* @param codePoint Character to add
*/
public void addCodePointValue(int codePoint) {
ensureByteMode();
String valStr = Integer.toHexString(codePoint).toUpperCase();
valStr = (valStr.length() < charSize * 2)
? StringFormat.padIt(valStr, charSize * 2, (char) 0, true)
: valStr;
sb.append(valStr);
}
/**
* Add byte values, shown as numeric hex values.
* <p>
* {@literal { 0, 1, 2 } -> 00,01,02}
*
* @param bytes to convert to hex and append. If null, append "???"
*/
public void addByteSeq(byte[] bytes) {
if (bytes == null) {
ensureByteMode();
sb.append("???");
return;
}
for (int i = 0; i < bytes.length; i++) {
ensureByteMode();
String valStr = Integer.toHexString(bytes[i] & 0xff).toUpperCase();
if (valStr.length() < 2) {
sb.append("0");
}
sb.append(valStr).append("h");
}
}
/**
* Add an unicode codepoint as its escaped hex value, with a escape character
* Add a unicode codepoint as its escaped hex value, with an escape character
* prefix of 'x', 'u' or 'U' depending on the magnitude of the codePoint value.
* <p>
* {@literal codePoint 15 -> '\' 'x' "0F"}<br>
* {@literal codePoint 65535 -> '\' 'u' "FFFF"}<br>
* {@literal codePoint 65536 -> '\' 'U' "10000"}<br>
* {@literal codePoint 65536 -> '\' 'U' "00010000"}<br>
*
* @param codePoint int value
*/
@ -151,18 +73,117 @@ public class StringRenderBuilder {
}
/**
* Example (quotes are part of result): {@code "Test\tstring",01,02,"Second\npart",00}
* Adds the characters found in the supplied {@link ByteBuffer} to the result.
* <p>
* @return Formatted string
* Any portions of the byte buffer that cause problems for the charset codec will be added
* as a {@link #addByteSeq(ByteBuffer, int) byte sequence}.
* <p>
* Characters that are outside the traditional ASCII range will be rendered as-is or as
* escape sequences, depending on the RENDER_ENUM setting.
*
* @param bb {@link ByteBuffer} containing bytes of a string
* @param cs {@link Charset} that should be used to decode the bytes
* @param renderSetting {@link RENDER_ENUM}
* @param trimTrailingNulls boolean flag, if true trailing null bytes will not be included
* in the rendered output
*/
@Override
public String toString() {
String str = sb.toString();
if (!byteMode) {
// close the quoted text mode in the local string
str = str + quoteChar;
public void decodeBytesUsingCharset(ByteBuffer bb, Charset cs, RENDER_ENUM renderSetting,
        boolean trimTrailingNulls) {
    if (!bb.hasRemaining()) {
        // Nothing to decode. Returning early also avoids calling flush() on a decoder
        // that never had decode() invoked, which throws IllegalStateException.
        return;
    }

    // REPORT (instead of the default REPLACE) makes the decoder stop at bad input so the
    // offending bytes can be rendered as a byte sequence.
    CharsetDecoder codec = cs.newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);

    // Size the output buffer for the worst case so a run of good text is never split
    // across several decode() calls. Trailing-null trimming below only looks at the chunk
    // that exhausts the input, so a split run of trailing nulls would otherwise be
    // partially rendered.
    int maxChars = (int) Math.ceil(bb.remaining() * (double) codec.maxCharsPerByte());
    CharBuffer cb = CharBuffer.allocate(Math.max(1, maxChars));

    while (bb.hasRemaining()) {
        CoderResult cr = codec.decode(bb, cb, true);
        if (!bb.hasRemaining() && trimTrailingNulls) {
            // this is the last chunk of text, trim nulls if requested
            trimTrailingNulls(cb);
        }
        flushStringModeCharBuf(cb, renderSetting);

        if (cr.isError()) {
            // the decoder stopped in front of the bad bytes; emit them as byte values
            addByteSeq(bb, cr.length());
        }
        else if (cr.isUnderflow()) {
            // there was a trailing byte sequence that the charset needs more bytes to
            // finish. Since we gave the charset all the bytes, any remaining will have to
            // be rendered as bytes values.
            // This can also trigger for successful end-of-input, remaining == 0
            addByteSeq(bb, bb.remaining());
        }
        // overflow: output was flushed above, loop around and continue decoding
    }

    // Drain any state the decoder is still holding; keep flushing while it reports that
    // the output buffer was too small.
    CoderResult flushResult = codec.flush(cb);
    while (flushResult.isOverflow()) {
        flushStringModeCharBuf(cb, renderSetting);
        flushResult = codec.flush(cb);
    }
    flushStringModeCharBuf(cb, renderSetting);
}
/**
 * Appends the characters of the specified string as-is (no escaping is applied here),
 * after first calling {@code ensureTextMode()}.
 *
 * @param str String to add
 */
private void addString(String str) {
ensureTextMode();
sb.append(str);
}
/**
 * Appends a single code point as text. If the code point is the same character as this
 * builder's quote character, it is preceded by a backslash.
 *
 * @param codePoint character to add
 */
private void addCodePointChar(int codePoint) {
    ensureTextMode();
    boolean isQuote = (quoteChar == codePoint);
    if (isQuote) {
        sb.append('\\');
    }
    sb.appendCodePoint(codePoint);
}
/**
 * Consumes {@code count} bytes from the buffer (advancing its position) and appends each
 * one as a two digit upper-case hex value followed by 'h', for example {@code 80h}.
 *
 * @param bytes {@link ByteBuffer} positioned at the first byte to emit
 * @param count number of bytes to consume and emit
 */
private void addByteSeq(ByteBuffer bytes, int count) {
    int emitted = 0;
    while (emitted < count) {
        ensureByteMode();
        int unsignedValue = bytes.get() & 0xff;
        sb.append("%02Xh".formatted(unsignedValue));
        emitted++;
    }
}
/**
 * Moves the buffer's position backwards over any NUL characters that sit immediately
 * before it, so they are excluded when the buffer is later flipped and read.
 *
 * @param cb {@link CharBuffer} that is still in "write" mode
 */
private void trimTrailingNulls(CharBuffer cb) {
    int end = cb.position();
    while (end > 0 && cb.get(end - 1) == 0) {
        end--;
    }
    cb.position(end);
}
/**
 * Renders the characters currently held in the {@link CharBuffer} and then resets the
 * buffer so it can be filled again.
 *
 * @param cb {@link CharBuffer} holding decoded characters, in "write" mode
 * @param renderSetting {@link RENDER_ENUM} passed through to the character renderer
 */
private void flushStringModeCharBuf(CharBuffer cb, RENDER_ENUM renderSetting) {
// switch from filling to draining: limit = current position, position = 0
cb.flip();
renderChars(cb, renderSetting);
// make the entire capacity available for the next decode pass
cb.clear();
}
/**
 * Appends each code point of the character sequence to the result, choosing between a
 * literal character and an escape sequence.
 * <p>
 * Checks are applied in this order: displayable characters are added literally, NUL is
 * added as backslash-zero, characters with a dedicated escape sequence use it, and ISO
 * control / undefined / byte-order-mark code points are always hex escaped. Anything
 * left over is added literally when the render setting is {@code ALL}, otherwise it is
 * hex escaped.
 *
 * @param stringValue characters to render
 * @param renderSetting {@link RENDER_ENUM} controlling how the remaining characters are
 * shown
 */
private void renderChars(CharSequence stringValue, RENDER_ENUM renderSetting) {
    final int end = stringValue.length();
    int pos = 0;
    while (pos < end) {
        final int cp = Character.codePointAt(stringValue, pos);
        pos += Character.charCount(cp);

        // NOTE(review): if isDisplayable() accepts the backslash character, the
        // backslash escape below is never reached for it -- confirm the intended order.
        if (StringUtilities.isDisplayable(cp)) {
            addCodePointChar(cp);
            continue;
        }
        if (cp == 0) {
            // TODO: there is an opportunity to make this smarter by not switching from
            // byte mode to string mode to add nulls.
            addString("\\0");
            continue;
        }
        if (StringUtilities.isControlCharacterOrBackslash(cp)) {
            addString(StringUtilities.convertCodePointToEscapeSequence(cp));
            continue;
        }

        boolean alwaysEscape = Character.isISOControl(cp) || !Character.isDefined(cp) ||
            cp == StringUtilities.UNICODE_BE_BYTE_ORDER_MARK;
        if (!alwaysEscape && renderSetting == RENDER_ENUM.ALL) {
            addCodePointChar(cp);
        }
        else {
            addEscapedCodePoint(cp);
        }
    }
}
private void ensureTextMode() {
@ -186,4 +207,34 @@ public class StringRenderBuilder {
byteMode = true;
}
/**
 * Returns the finished, formatted string.
 * <p>
 * If nothing was added, the result is an empty pair of quote characters. When this
 * builder was created for a UTF charset and the result starts with a quoted section, a
 * prefix of {@code u8}, {@code u} or {@code U} is added for char sizes 1, 2 and 4.
 *
 * @return formatted string, with optional prefix
 */
public String build() {
    // TODO: change the string prefix modifier to align with what decompiler does
    String rendered;
    if (sb.isEmpty()) {
        rendered = "%c%c".formatted(quoteChar, quoteChar); // '' won't make sense
    }
    else {
        rendered = toString();
    }

    boolean startsQuoted = !rendered.isEmpty() && rendered.charAt(0) == quoteChar;
    if (!utfCharset || !startsQuoted) {
        return rendered;
    }

    String prefix;
    switch (charSize) {
        case 1:
            prefix = "u8";
            break;
        case 2:
            prefix = "u";
            break;
        case 4:
            prefix = "U";
            break;
        default:
            prefix = "";
            break;
    }
    return prefix + rendered;
}
/**
 * Returns the text accumulated so far, without any charset prefix. If a quoted text
 * section is still open, the closing quote is added to the returned copy only; the
 * builder itself is not modified.
 * <p>
 * Example (quotes are part of result): {@code "te",80h,"st"}
 * <p>
 * @return Formatted string
 */
@Override
public String toString() {
    String text = sb.toString();
    if (byteMode) {
        return text;
    }
    // close the quoted text mode in the local string
    return text + quoteChar;
}
}

View File

@ -0,0 +1,229 @@
/* ###
* IP: GHIDRA
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ghidra.program.model.data;
import static org.junit.Assert.*;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import org.junit.Test;
import generic.test.AbstractGTest;
import ghidra.program.model.data.RenderUnicodeSettingsDefinition.RENDER_ENUM;
/**
 * Tests for {@link StringRenderBuilder}: empty results, null trimming, escaping, charset
 * prefixes and the rendering of bytes that the charset could not decode.
 */
public class StringRenderBuilderTest extends AbstractGTest {

    /** Wraps the given values (one byte each) in a {@link ByteBuffer}. */
    private ByteBuffer bb(int... values) {
        return ByteBuffer.wrap(bytes(values));
    }

    /** Creates a double-quote builder with nothing added to it yet. */
    private static StringRenderBuilder newBuilder(boolean utfCharset, int charSize) {
        return new StringRenderBuilder(utfCharset, charSize, StringRenderBuilder.DOUBLE_QUOTE);
    }

    /** Decodes the bytes with a fresh double-quote builder and returns the built string. */
    private static String render(boolean utfCharset, int charSize, ByteBuffer bytes,
            java.nio.charset.Charset cs, RENDER_ENUM renderSetting, boolean trimNulls) {
        StringRenderBuilder srb = newBuilder(utfCharset, charSize);
        srb.decodeBytesUsingCharset(bytes, cs, renderSetting, trimNulls);
        return srb.build();
    }

    /** Renders single byte US-ASCII text with the {@code ALL} render setting. */
    private static String renderAscii(ByteBuffer bytes, boolean trimNulls) {
        return render(false, 1, bytes, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, trimNulls);
    }

    @Test
    public void testEmptyString() {
        assertEquals("\"\"", newBuilder(false, 1).build());
    }

    @Test
    public void testEmptyWChar2String() {
        assertEquals("u\"\"", newBuilder(true, 2).build());
    }

    @Test
    public void testEmptyWChar4String() {
        assertEquals("U\"\"", newBuilder(true, 4).build());
    }

    @Test
    public void testEmptyStringWithNulls() {
        String actual = renderAscii(bb(0, 0, 0), true);
        assertEquals("\"\"", actual);
    }

    @Test
    public void testEmptyStringWithNullsNoTrim() {
        String actual = renderAscii(bb(0, 0, 0), false);
        assertEquals("\"\\0\\0\\0\"", actual);
    }

    @Test
    public void testInteriorNulls() {
        String actual = renderAscii(bb('t', 'e', 0, 's', 't', 0), true);
        assertEquals("\"te\\0st\"", actual);
    }

    @Test
    public void testSimpleString() {
        String actual = renderAscii(bb('t', 'e', 's', 't'), true);
        assertEquals("\"test\"", actual);
    }

    @Test
    public void testStandardEscapedChars() {
        String actual = renderAscii(bb('t', 'e', 's', 't', '\n', '\t', '\r'), true);
        assertEquals("\"test\\n\\t\\r\"", actual);
    }

    @Test
    public void testQuotedQuotesChars() {
        String actual = renderAscii(bb('t', 'e', 's', 't', '"', '1', '2', '3'), true);
        assertEquals("\"test\\\"123\"", actual);
    }

    @Test
    public void testSingleQuoteChars() {
        String actual = renderAscii(bb('t', 'e', 's', 't', '\'', '1', '2', '3'), true);
        assertEquals("\"test'123\"", actual);
    }

    @Test
    public void testSimpleStringWithTrailingNulls() {
        String actual = renderAscii(bb('t', 'e', 's', 't', 0, 0, 0), true);
        assertEquals("\"test\"", actual);
    }

    @Test
    public void testSimpleStringWithTrailingNullsNoTrim() {
        String actual = renderAscii(bb('t', 'e', 's', 't', 0, 0, 0), false);
        assertEquals("\"test\\0\\0\\0\"", actual);
    }

    @Test
    public void testUtf8String() {
        ByteBuffer encoded = bb(0xE1, 0x84, 0xA2); // should decode to code point 0x1122
        String actual =
            render(true, 1, encoded, StandardCharsets.UTF_8, RENDER_ENUM.ALL, true);
        assertEquals("u8\"\u1122\"", actual);
    }

    @Test
    public void testUtf8NoRenderNonLatinString() {
        ByteBuffer encoded = bb(0xE1, 0x84, 0xA2); // should decode to code point 0x1122
        String actual =
            render(true, 1, encoded, StandardCharsets.UTF_8, RENDER_ENUM.ESC_SEQ, true);
        assertEquals("u8\"\\u1122\"", actual); // <- result is \ u 1122
    }

    @Test
    public void testBadBytes_USASCII() {
        String actual = renderAscii(bb('t', 'e', 's', 't', 0x80), true);
        assertEquals("\"test\",80h", actual);
    }

    @Test
    public void testBadBytes_USASCII2() {
        // bad bytes in interior of string, switching modes
        String actual = renderAscii(bb('t', 'e', 0x80, 's', 't'), true);
        assertEquals("\"te\",80h,\"st\"", actual);
    }

    @Test
    public void testBadBytes_USASCII3() {
        // bad bytes at beginning of string
        String actual = renderAscii(bb(0x80, 't', 'e', 's', 't'), true);
        assertEquals("80h,\"test\"", actual);
    }

    @Test
    public void testTruncatedUtf8() {
        ByteBuffer encoded = bb('t', 'e', 's', 't', 0xE1, 0x84);
        String actual =
            render(true, 1, encoded, StandardCharsets.UTF_8, RENDER_ENUM.ALL, true);
        assertEquals("u8\"test\",E1h,84h", actual);
    }

    @Test
    public void testUtf16() {
        ByteBuffer encoded = bb('t', 0, 'e', 0, 's', 0, 't', 0);
        String actual =
            render(true, 2, encoded, StandardCharsets.UTF_16LE, RENDER_ENUM.ALL, true);
        assertEquals("u\"test\"", actual);
    }

    @Test
    public void testUtf16BOM_LE() {
        ByteBuffer encoded = bb(0xff, 0xfe, 't', 0, 'e', 0, 's', 0, 't', 0);
        String actual =
            render(true, 2, encoded, StandardCharsets.UTF_16LE, RENDER_ENUM.ALL, true);
        assertEquals("u\"\\uFEFFtest\"", actual);
    }
}