Merge remote-tracking branch 'origin/GP-4160_dev747368_charset_logic--SQUASHED'

2024-11-28 23:21:46 +00:00 · 2023-12-27 07:39:45 -05:00 · 2023-12-27 07:39:45 -05:00 · 9226e7e090
commit 9226e7e090
parent fa8eff4d33 164b4fde49
3 changed files with 472 additions and 298 deletions
--- a/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringDataInstance.java
+++ b/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringDataInstance.java
@ -45,8 +45,6 @@ import ghidra.util.*;
 */
 public class StringDataInstance {

-	private static final int ASCII_MAX = 0x7f;
-
 	/**
 	 * Returns true if the {@link Data} instance is a 'string'.
 	 *
@ -61,8 +59,8 @@ public class StringDataInstance {
 		if (dt instanceof AbstractStringDataType) {
 			return true;
 		}
-		if (dt instanceof Array) {
-			ArrayStringable as = ArrayStringable.getArrayStringable(((Array) dt).getDataType());
+		if (dt instanceof Array arrayDt) {
+			ArrayStringable as = ArrayStringable.getArrayStringable(arrayDt.getDataType());
 			return (as != null) && as.hasStringValue(data);
 		}
 		return false;
@ -79,11 +77,11 @@ public class StringDataInstance {
 	 * @return boolean true if data type is or could be a string
 	 */
 	public static boolean isStringDataType(DataType dt) {
-		if (dt instanceof TypeDef) {
-			dt = ((TypeDef) dt).getBaseDataType();
+		if (dt instanceof TypeDef td) {
+			dt = td.getBaseDataType();
 		}
-		return dt instanceof AbstractStringDataType || (dt instanceof Array &&
-			ArrayStringable.getArrayStringable(((Array) dt).getDataType()) != null);
+		return dt instanceof AbstractStringDataType || (dt instanceof Array arrayDt &&
+			ArrayStringable.getArrayStringable(arrayDt.getDataType()) != null);
 	}

 	/**
@ -161,13 +159,12 @@ public class StringDataInstance {
 			return NULL_INSTANCE;
 		}
 		DataType dt = data.getBaseDataType();
-		if (dt instanceof AbstractStringDataType) {
-			return ((AbstractStringDataType) dt).getStringDataInstance(data, data,
-				data.getLength());
+		if (dt instanceof AbstractStringDataType asdt) {
+			return asdt.getStringDataInstance(data, data, data.getLength());
 		}
-		if (dt instanceof Array && data.isInitializedMemory()) {
+		if (dt instanceof Array arrayDt && data.isInitializedMemory()) {
 			ArrayStringable arrayStringable =
-				ArrayStringable.getArrayStringable(((Array) dt).getDataType());
+				ArrayStringable.getArrayStringable(arrayDt.getDataType());
 			if (arrayStringable != null && arrayStringable.hasStringValue(data)) {
 				return new StringDataInstance(arrayStringable, data, data, data.getLength(), true);
 			}
@ -188,15 +185,15 @@ public class StringDataInstance {
 	 */
 	public static StringDataInstance getStringDataInstance(DataType dataType, MemBuffer buf,
 			Settings settings, int length) {
-		if (dataType instanceof AbstractStringDataType) {
-			return ((AbstractStringDataType) dataType).getStringDataInstance(buf, settings, length);
+		if (dataType instanceof AbstractStringDataType asdt) {
+			return asdt.getStringDataInstance(buf, settings, length);
 		}
 		boolean isArray = dataType instanceof Array;
 		if (isArray) {
 			dataType = ArrayStringable.getArrayStringable(((Array) dataType).getDataType());
 		}
-		if (dataType instanceof ArrayStringable &&
-			((ArrayStringable) dataType).hasStringValue(settings) && buf.isInitializedMemory()) {
+		if (dataType instanceof ArrayStringable arrayStringable &&
+			arrayStringable.hasStringValue(settings) && buf.isInitializedMemory()) {

 			// this could be either a charsequence or an array of char elements
 			return new StringDataInstance(dataType, settings, buf, length, isArray);
@ -250,11 +247,6 @@ public class StringDataInstance {
 	public static final String UNKNOWN = "??";
 	public static final String UNKNOWN_DOT_DOT_DOT = "??...";

-	/**
-	 * A string with a single char that is the Byte-Order-Mark character.
-	 */
-	private static final String BOM_RESULT_STR = "\ufeff";
-
 	static final int SIZEOF_PASCAL255_STR_LEN_FIELD = 1;
 	static final int SIZEOF_PASCAL64k_STR_LEN_FIELD = 2;

@ -339,8 +331,7 @@ public class StringDataInstance {

 	private static String getTranslatedValue(Settings settings, MemBuffer buf) {
 		// Translation only exists for defined Data which corresponds to settings.
-		if (settings instanceof Data) {
-			Data data = (Data) settings;
+		if (settings instanceof Data data) {
 			if (data.isDefined()) {
 				return TRANSLATION.getTranslatedValue(data);
 			}
@ -374,8 +365,8 @@ public class StringDataInstance {
 	}

 	private static StringLayoutEnum getLayoutFromDataType(DataType dataType) {
-		if (dataType instanceof AbstractStringDataType) {
-			return ((AbstractStringDataType) dataType).getStringLayout();
+		if (dataType instanceof AbstractStringDataType asdt) {
+			return asdt.getStringLayout();
 		}
 		if (dataType instanceof AbstractIntegerDataType || dataType instanceof BitFieldDataType) {
 			return StringLayoutEnum.CHAR_SEQ;
@ -384,11 +375,11 @@ public class StringDataInstance {
 	}

 	static String getCharsetNameFromDataTypeOrSettings(DataType dataType, Settings settings) {
-		if (dataType instanceof BitFieldDataType) {
-			dataType = ((BitFieldDataType) dataType).getBaseDataType();
+		if (dataType instanceof BitFieldDataType bfdt) {
+			dataType = bfdt.getBaseDataType();
 		}
-		return (dataType instanceof DataTypeWithCharset)
-				? ((DataTypeWithCharset) dataType).getCharsetName(settings)
+		return (dataType instanceof DataTypeWithCharset dtwcs)
+				? dtwcs.getCharsetName(settings)
 				: DEFAULT_CHARSET_NAME;
 	}

@ -586,12 +577,16 @@ public class StringDataInstance {
 		}
 		byte[] stringBytes = convertPaddedToUnpadded(getStringBytes());
 		if (stringBytes == null) {
-			return StringDataInstance.UNKNOWN_DOT_DOT_DOT;
+			return UNKNOWN_DOT_DOT_DOT;
 		}
-		AdjustedCharsetInfo aci = getAdjustedCharsetInfo(stringBytes);
-		String str = convertBytesToString(stringBytes, aci);
+		ByteBuffer bb = ByteBuffer.wrap(stringBytes);
+		String adjustedCharsetName = getAdjustedCharsetInfo(bb); // force BE or LE variants of UTF charsets

-		return str;
+		if (!Charset.isSupported(adjustedCharsetName)) {
+			return UNKNOWN_DOT_DOT_DOT;
+		}
+		Charset cs = Charset.forName(adjustedCharsetName);
+		return new String(stringBytes, cs);
 	}

 	private byte[] getStringBytes() {
@ -709,13 +704,6 @@ public class StringDataInstance {
 		return buf.isBigEndian() ? Endian.BIG : Endian.LITTLE;
 	}

-	private String convertBytesToString(byte[] bytes, AdjustedCharsetInfo aci) {
-		Charset cs = Charset.isSupported(aci.charsetName) ? Charset.forName(aci.charsetName) : null;
-		return (cs != null)
-				? new String(bytes, aci.byteStartOffset, bytes.length - aci.byteStartOffset, cs)
-				: convertBytesToStringCustomCharset(bytes, aci);
-	}
-
 	private AdjustedCharsetInfo getAdjustedCharsetInfo() {
 		if (length == -1 && getStringLength() == -1) {
 			return getAdjustedCharsetInfo(new byte[] {});
@ -751,30 +739,21 @@ public class StringDataInstance {
 		return result;
 	}

-	private static DataConverter getDataConverter(Endian endian) {
-		return endian == Endian.BIG ? BigEndianDataConverter.INSTANCE
-				: LittleEndianDataConverter.INSTANCE;
-	}
-
-	/*
-	 * Converts a byte array to String based on a custom Ghidra charset name.
-	 */
-	private static String convertBytesToStringCustomCharset(byte[] bytes, AdjustedCharsetInfo aci) {
-		switch (aci.charsetName) {
-			case "UTF-32LE":
-			case "UTF-32BE":
-				// fall-back because real jvm supplied UTF-32 Charset isn't guaranteed to be present
-				DataConverter dc = getDataConverter(aci.endian);
-				int[] codePoints = new int[(bytes.length - aci.byteStartOffset) / 4];
-				for (int i = 0; i < codePoints.length; i++) {
-					codePoints[i] = dc.getInt(bytes, aci.byteStartOffset + (i * 4));
-					if (codePoints[i] < 0 || codePoints[i] > Character.MAX_CODE_POINT) {
-						codePoints[i] = StringUtilities.UNICODE_REPLACEMENT;
-					}
-				}
-				return new String(codePoints, 0, codePoints.length);
+	private String getAdjustedCharsetInfo(ByteBuffer bb) { // returns a charset name (String)
+		String result = charsetName;
+		if (CharsetInfo.isBOMCharset(charsetName)) {
+			Endian endian = getEndiannessFromBOM(bb, charSize); // first choice: BOM in the string bytes
+			if (endian == null) {
+				endian = endianSetting; // no BOM: fall back to the endianSetting field
+			}
+			if (endian == null) {
+				endian = getMemoryEndianness(); // still unknown: fall back to getMemoryEndianness()
+			}
+			// add "LE" or "BE" to the end of the charset's name, depending
+			// on the discovered endianness of the string
+			result += endian.toShortString();
 		}
-		return null;
+		return result;
+	}

 	private static Endian getEndiannessFromBOM(byte[] bytes, int charSize) {
@ -792,6 +771,25 @@ public class StringDataInstance {
 		return null;
 	}

+	/**
+	 * Determines the endianness indicated by a byte-order-mark (BOM) located at the
+	 * current position of the supplied buffer.
+	 * <p>
+	 * The buffer's position is not modified by this method.
+	 *
+	 * @param bb {@link ByteBuffer} containing the bytes of the string
+	 * @param charSize size, in bytes, of a single character (and therefore of the BOM)
+	 * @return {@link Endian#BIG} or {@link Endian#LITTLE} if the first character is a
+	 * recognized BOM, otherwise null (including when fewer than charSize bytes remain)
+	 */
+	private static Endian getEndiannessFromBOM(ByteBuffer bb, int charSize) {
+		if (bb.remaining() < charSize) {
+			return null;
+		}
+
+		// Absolute read starting at the buffer's position: this examines exactly the bytes
+		// that the remaining() check above accounted for (index 0 would be wrong for a
+		// buffer whose position is not 0) and leaves the caller's position untouched.
+		byte[] bomBytes = new byte[charSize];
+		bb.get(bb.position(), bomBytes);
+
+		int beVal = (int) BigEndianDataConverter.INSTANCE.getValue(bomBytes, charSize);
+		return switch (beVal) {
+			case StringUtilities.UNICODE_BE_BYTE_ORDER_MARK -> Endian.BIG;
+			case StringUtilities.UNICODE_LE16_BYTE_ORDER_MARK,
+					StringUtilities.UNICODE_LE32_BYTE_ORDER_MARK -> Endian.LITTLE;
+			default -> null;
+		};
+	}
+
 	/**
 	 * Returns a formatted version of the string returned by {@link #getStringValue()}.
 	 * <p>
@ -806,8 +804,8 @@ public class StringDataInstance {
 	 */
 	public String getStringRepresentation() {
 		return showTranslation && translatedValue != null
-			? getTranslatedStringRepresentation(translatedValue)
-			: getStringRep(StringRenderBuilder.DOUBLE_QUOTE, StringRenderBuilder.DOUBLE_QUOTE);
+				? getTranslatedStringRepresentation(translatedValue)
+				: getStringRep(StringRenderBuilder.DOUBLE_QUOTE);
 	}

 	/**
@ -824,13 +822,15 @@ public class StringDataInstance {
 	 * @return formatted String
 	 */
 	public String getStringRepresentation(boolean originalOrTranslated) {
+		if (!originalOrTranslated && translatedValue == null) {
+			return UNKNOWN;
+		}
 		return originalOrTranslated
-				? getStringRep(StringRenderBuilder.DOUBLE_QUOTE, StringRenderBuilder.DOUBLE_QUOTE)
-				: translatedValue != null ? getTranslatedStringRepresentation(translatedValue)
-				: UNKNOWN;
+				? getStringRep(StringRenderBuilder.DOUBLE_QUOTE)
+				: getTranslatedStringRepresentation(translatedValue);
 	}

-	private String getStringRep(char quoteChar, char quoteCharMulti) {
+	private String getStringRep(char quoteChar) {

 		if (isProbe() || isBadCharSize() || !buf.isInitializedMemory()) {
 			return UNKNOWN;
@ -840,122 +840,22 @@ public class StringDataInstance {
 		if (stringBytes == null) {
 			return UNKNOWN_DOT_DOT_DOT;
 		}
-		AdjustedCharsetInfo aci = getAdjustedCharsetInfo(stringBytes);
-		String stringValue = convertBytesToString(stringBytes, aci);
-		if (stringValue == null) {
+
+		ByteBuffer bb = ByteBuffer.wrap(stringBytes);
+		String adjustedCharsetName = getAdjustedCharsetInfo(bb); // force BE or LE variants of UTF charsets
+
+		StringRenderBuilder renderer =
+			new StringRenderBuilder(adjustedCharsetName.startsWith("UTF"), charSize, quoteChar);
+
+		if (!Charset.isSupported(adjustedCharsetName)) {
 			return UNKNOWN_DOT_DOT_DOT;
 		}
-
-		if (stringValue.length() == 0 && aci.byteStartOffset != 0) {
-			// If the byteStartOffset isn't zero it means there was one char that was the unicode BOM.
-			// Asking the Charset to decode it returned nothing, so force it.
-			stringValue = BOM_RESULT_STR;
-		}
-
-		// if we get the same number of characters out that we put into the decoder,
-		// then its a good chance there is a one-to-one correspondence between original char
-		// offsets and decoded char offsets.
-		boolean isByteToStringCharEquiv =
-			stringValue.length() == ((stringBytes.length - aci.byteStartOffset) / charSize);
-
-		stringValue = stringLayout.shouldTrimTrailingNulls() ? trimNulls(stringValue) : stringValue;
-
-		StringRenderBuilder strBuf = new StringRenderBuilder(charSize,
-			stringValue.length() == 1 ? quoteChar : quoteCharMulti);
-
-		if (stringValue.isEmpty() || (stringValue.length() == 1 && stringValue.charAt(0) == 0)) {
-			// force the string renderer into "string" mode so we get empty quotes when done.
-			strBuf.addString("");
-		}
-
-		// For each 32bit character in the java string try to add it to the StringRenderBuilder
-		for (int i = 0, strLength = stringValue.length(); i < strLength;) {
-			int codePoint = stringValue.codePointAt(i);
-
-			RENDER_ENUM currentCharRenderSetting = renderSetting;
-			if (codePoint == StringUtilities.UNICODE_REPLACEMENT && isByteToStringCharEquiv &&
-				!isReplacementCharAt(stringBytes, i * charSize + aci.byteStartOffset)) {
-				// if this is a true decode error and we can recover the original bytes,
-				// then force the render mode to byte seq.
-				currentCharRenderSetting = RENDER_ENUM.BYTE_SEQ;
-			}
-
-			if (StringUtilities.isControlCharacterOrBackslash(codePoint)) {
-				strBuf.addString(StringUtilities.convertCodePointToEscapeSequence(codePoint));
-			}
-			else if (codePoint == 0x0000 && renderSetting != RENDER_ENUM.BYTE_SEQ) {
-				strBuf.addEscapedChar('0');
-			}
-			else if (StringUtilities.isDisplayable(codePoint)) {
-				strBuf.addCodePointChar(codePoint);
-			}
-			else {
-				// not simple ascii, decide how to handle:
-				// add the character to the string in a format depending on the
-				// render settings.  ISO control chars are forced to be
-				// escaped regardless of the render setting.
-				if (currentCharRenderSetting == RENDER_ENUM.ALL) {
-					if (codePoint <= ASCII_MAX) {
-						// render non-displayable, non-control-char ascii-ish bytes as bytes instead
-						// of as escape sequences
-						currentCharRenderSetting = RENDER_ENUM.BYTE_SEQ;
-					}
-					else if (Character.isISOControl(codePoint) || !Character.isDefined(codePoint) ||
-						codePoint == StringUtilities.UNICODE_BE_BYTE_ORDER_MARK) {
-						currentCharRenderSetting = RENDER_ENUM.ESC_SEQ;
-					}
-				}
-
-				switch (currentCharRenderSetting) {
-					case ALL:
-						strBuf.addCodePointChar(codePoint);
-						break;
-					case BYTE_SEQ:
-						strBuf.addByteSeq(getOriginalBytes(isByteToStringCharEquiv, i, codePoint,
-							stringBytes, aci));
-						break;
-					case ESC_SEQ:
-						strBuf.addEscapedCodePoint(codePoint);
-						break;
-				}
-			}
-			i += Character.charCount(codePoint);
-		}
-		String prefix = "";
-		if (charsetName.startsWith("UTF") && strBuf.startsWithQuotedText()) {
-			switch (charSize) {
-				case 1:
-					prefix = "u8";
-					break;
-				case 2:
-					prefix = "u";
-					break;
-				case 4:
-					prefix = "U";
-					break;
-			}
-		}
-		return prefix + strBuf.toString();
-	}
-
-	private byte[] getOriginalBytes(boolean isByteToStringCharEquiv, int charOffset, int codePoint,
-			byte[] stringBytes, AdjustedCharsetInfo aci) {
-
-		if (isByteToStringCharEquiv) {
-			byte[] originalCharBytes = new byte[charSize];
-			System.arraycopy(stringBytes, charOffset * charSize + aci.byteStartOffset,
-				originalCharBytes, 0, charSize);
-			return originalCharBytes;
-		}
-
-		// can't get original bytes, cheat and run the codePoint through the charset
-		// to get what should be the same as the original bytes.
-		String singleCharStr = new String(new int[] { codePoint }, 0, 1);
-		Charset cs = Charset.isSupported(aci.charsetName) ? Charset.forName(aci.charsetName) : null;
-		if (cs == null || !cs.canEncode()) {
-			return null;
-		}
-		return singleCharStr.getBytes(cs);
+		Charset cs = Charset.forName(adjustedCharsetName);
+		renderer.decodeBytesUsingCharset(bb, cs, renderSetting,
+			stringLayout.shouldTrimTrailingNulls());
+		
+		String result = renderer.build();
+		return result;
 	}

 	/**
@ -1024,17 +924,11 @@ public class StringDataInstance {
 		StringDataInstance charseqSDI =
 			new StringDataInstance(this, StringLayoutEnum.CHAR_SEQ, buf, length, newCSName);

-		return charseqSDI.getStringRep(StringRenderBuilder.SINGLE_QUOTE,
-			StringRenderBuilder.DOUBLE_QUOTE);
-	}
+		char quoteChar = length == charSize
+				? StringRenderBuilder.SINGLE_QUOTE
+				: StringRenderBuilder.DOUBLE_QUOTE;

-	private boolean isReplacementCharAt(byte[] stringBytes, int byteOffset) {
-		if (byteOffset + charSize > stringBytes.length) {
-			return false;
-		}
-		long origCodePointValue = DataConverter.getInstance(buf.isBigEndian())
-				.getValue(stringBytes, byteOffset, charSize);
-		return origCodePointValue == StringUtilities.UNICODE_REPLACEMENT;
+		return charseqSDI.getStringRep(quoteChar);
 	}

 	private static String getTranslatedStringRepresentation(String translatedString) {
--- a/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringRenderBuilder.java
+++ b/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringRenderBuilder.java
@ -15,16 +15,20 @@
 */
 package ghidra.program.model.data;

-import ghidra.util.StringFormat;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.*;
+
+import ghidra.program.model.data.RenderUnicodeSettingsDefinition.RENDER_ENUM;
 import ghidra.util.StringUtilities;

 /**
 * Helper class used to build up a formatted (for human consumption) string representation returned
 * by Unicode and String data types.
 * <p>
- * Call {@link #toString()} to retrieve the formatted string.
+ * Call {@link #build()} to retrieve the formatted string.
 * <p>
- * Example (quotes are part of result): {@code "Test\tstring",01,02,"Second\npart",00}
+ * Example (quotes are part of result): {@code "Test\tstring",01h,02h,"Second\npart"}
 *
 */
 public class StringRenderBuilder {
@ -32,111 +36,29 @@ public class StringRenderBuilder {
 	public static final char SINGLE_QUOTE = '\'';
 	private static final int MAX_ASCII = 0x80;

-	private StringBuilder sb = new StringBuilder();
-	private boolean byteMode = true;
-	private final char quoteChar;
+	private final StringBuilder sb = new StringBuilder();
 	private final int charSize;
+	private final boolean utfCharset;
+	private final char quoteChar;
+	private boolean byteMode = true;

-	public StringRenderBuilder(int charSize) {
-		this(charSize, DOUBLE_QUOTE);
+	public StringRenderBuilder(boolean utfCharset, int charSize) {
+		this(utfCharset, charSize, DOUBLE_QUOTE);
 	}

-	public StringRenderBuilder(int charSize, char quoteChar) {
+	public StringRenderBuilder(boolean utfCharset, int charSize, char quoteChar) {
 		this.charSize = charSize;
+		this.utfCharset = utfCharset;
 		this.quoteChar = quoteChar;
 	}

 	/**
-	 * Returns true if the current formatted string starts with a quoted text section,
-	 * instead of a byte value section.  Useful to indicate if
-	 * the string could have a prefix applied to it (ie. u8"text")
-	 * <p>
-	 * 
-	 * @return boolean true if this string will start with a quoted text section
-	 */
-	public boolean startsWithQuotedText() {
-		return sb.length() > 0 && sb.charAt(0) == quoteChar;
-	}
-
-	/**
-	 * Append the characters in the specified string. The added characters will
-	 * be shown in a quoted text region.
-	 *
-	 * @param str String to add
-	 */
-	public void addString(String str) {
-		ensureTextMode();
-		sb.append(str);
-	}
-
-	/**
-	 * Append the specified char after an escaping backslash "\", ie
-	 * {@literal "x" -> "\x";}
-	 *
-	 * @param ch
-	 */
-	public void addEscapedChar(char ch) {
-		ensureTextMode();
-		sb.append("\\").append(ch);
-	}
-
-	/**
-	 * Add a single character.  It will be shown in a quoted text region.
-	 *
-	 * @param codePoint Character to add
-	 */
-	public void addCodePointChar(int codePoint) {
-		ensureTextMode();
-		if (codePoint == quoteChar) {
-			sb.append("\\");
-		}
-		sb.appendCodePoint(codePoint);
-	}
-
-	/**
-	 * Add a single character that needs to be shown as a numeric hex value.
-	 *
-	 * @param codePoint Character to add
-	 */
-	public void addCodePointValue(int codePoint) {
-		ensureByteMode();
-		String valStr = Integer.toHexString(codePoint).toUpperCase();
-		valStr = (valStr.length() < charSize * 2)
-				? StringFormat.padIt(valStr, charSize * 2, (char) 0, true)
-				: valStr;
-		sb.append(valStr);
-	}
-
-	/**
-	 * Add byte values, shown as numeric hex values.
-	 * <p>
-	 * {@literal { 0, 1, 2 } -> 00,01,02}
-	 *
-	 * @param bytes to convert to hex and append.  If null, append "???"
-	 */
-	public void addByteSeq(byte[] bytes) {
-		if (bytes == null) {
-			ensureByteMode();
-			sb.append("???");
-			return;
-		}
-		for (int i = 0; i < bytes.length; i++) {
-			ensureByteMode();
-			String valStr = Integer.toHexString(bytes[i] & 0xff).toUpperCase();
-			if (valStr.length() < 2) {
-				sb.append("0");
-			}
-			sb.append(valStr).append("h");
-		}
-	}
-
-	/**
-	 * Add an unicode codepoint as its escaped hex value, with a escape character
+	 * Add a unicode codepoint as its escaped hex value, with an escape character
 	 * prefix of 'x', 'u' or 'U' depending on the magnitude of the codePoint value.
 	 * <p>
 	 * {@literal codePoint 15 -> '\' 'x' "0F"}<br>
 	 * {@literal codePoint 65535 -> '\' 'u' "FFFF"}<br>
-	 * {@literal codePoint 65536 -> '\' 'U' "10000"}<br>
+	 * {@literal codePoint 65536 -> '\' 'U' "00010000"}<br>
 	 *
 	 * @param codePoint int value
 	 */
@ -151,18 +73,117 @@ public class StringRenderBuilder {
 	}

 	/**
-	 * Example (quotes are part of result): {@code "Test\tstring",01,02,"Second\npart",00}
+	 * Adds the characters found in the supplied {@link ByteBuffer} to the result.
 	 * <p>
-	 * @return Formatted string
+	 * Any portions of the byte buffer that cause problems for the charset codec will be added
+	 * as a {@link #addByteSeq(ByteBuffer, int) byte sequence}.
+	 * <p>
+	 * Characters that are outside the traditional ASCII range will be rendered as-is or as
+	 * escape sequences, depending on the RENDER_ENUM setting.
+	 *  
+	 * @param bb {@link ByteBuffer} containing bytes of a string
+	 * @param cs {@link Charset} that should be used to decode the bytes
+	 * @param renderSetting {@link RENDER_ENUM}
+	 * @param trimTrailingNulls boolean flag, if true trailing null bytes will not be included
+	 * in the rendered output
 	 */
-	@Override
-	public String toString() {
-		String str = sb.toString();
-		if (!byteMode) {
-			// close the quoted text mode in the local string
-			str = str + quoteChar;
+	public void decodeBytesUsingCharset(ByteBuffer bb, Charset cs, RENDER_ENUM renderSetting,
+			boolean trimTrailingNulls) {
+		if (!bb.hasRemaining()) {
+			// Nothing to decode.  Return before touching the decoder: calling
+			// CharsetDecoder.flush() without a preceding decode(..., true) throws
+			// IllegalStateException.
+			return;
+		}
+		CharsetDecoder codec = cs.newDecoder()
+				.onMalformedInput(CodingErrorAction.REPORT)
+				.onUnmappableCharacter(CodingErrorAction.REPORT);
+
+		// Size the char buffer so that everything the remaining bytes can decode to fits in
+		// a single chunk.  Chunks are then only split at decode errors, which guarantees
+		// that a run of trailing nulls always lands in the final chunk where it is trimmed.
+		int maxChars = (int) Math.ceil(bb.remaining() * (double) codec.maxCharsPerByte());
+		CharBuffer cb = CharBuffer.allocate(Math.max(1, maxChars));
+		while (bb.hasRemaining()) {
+			CoderResult cr = codec.decode(bb, cb, true);
+			if (!bb.hasRemaining() && trimTrailingNulls) {
+				// this is the last chunk of text, trim nulls from its end
+				trimTrailingNulls(cb);
+			}
+			flushStringModeCharBuf(cb, renderSetting);
+			if (cr.isError()) {
+				// the codec could not handle the bytes at the current position, render
+				// them as byte values and resume decoding after them
+				addByteSeq(bb, cr.length());
+			}
+			else if (cr.isUnderflow()) {
+				// there was a trailing byte sequence that the charset needs more bytes to
+				// finish. Since we gave the charset all the bytes, any remaining will have to
+				// be rendered as bytes values.
+				// This can also trigger for successful end-of-input, remaining == 0
+				addByteSeq(bb, bb.remaining());
+			}
 		}
-		return str;
+
+		// Let the decoder emit any characters it is still holding on to.  A result other
+		// than underflow is not expected here and is deliberately ignored (best effort).
+		codec.flush(cb);
+		flushStringModeCharBuf(cb, renderSetting);
+	}
+
+	/**
+	 * Appends the given text verbatim, after making sure the builder is in text mode.
+	 *
+	 * @param text characters to append
+	 */
+	private void addString(String text) {
+		ensureTextMode();
+		sb.append(text);
+	}
+
+	/**
+	 * Appends a single code point as a character, after making sure the builder is in
+	 * text mode.  A code point equal to the quote character is preceded by a backslash.
+	 *
+	 * @param codePoint code point to append
+	 */
+	private void addCodePointChar(int codePoint) {
+		ensureTextMode();
+		boolean isQuote = codePoint == quoteChar;
+		if (isQuote) {
+			sb.append("\\");
+		}
+		sb.appendCodePoint(codePoint);
+	}
+
+	/**
+	 * Consumes {@code count} bytes from the buffer and appends each one as two
+	 * upper-case hex digits followed by 'h'.  {@link #ensureByteMode()} is invoked
+	 * before every byte.
+	 *
+	 * @param bytes buffer to read the bytes from (its position is advanced)
+	 * @param count number of bytes to consume
+	 */
+	private void addByteSeq(ByteBuffer bytes, int count) {
+		int todo = count;
+		while (todo-- > 0) {
+			ensureByteMode();
+			byte b = bytes.get();
+			sb.append(String.format("%02Xh", b));
+		}
+	}
+
+	/**
+	 * Moves the buffer's position backwards past any null characters that immediately
+	 * precede it, so they are dropped when the buffer is later flipped.
+	 *
+	 * @param cb char buffer in write mode (position marks the end of the written chars)
+	 */
+	private void trimTrailingNulls(CharBuffer cb) {
+		int end = cb.position();
+		while (end > 0 && cb.get(end - 1) == 0) {
+			end--;
+		}
+		cb.position(end);
+	}
+
+	/**
+	 * Renders the characters accumulated in the char buffer and then resets the
+	 * buffer so it can be filled again.
+	 *
+	 * @param chars char buffer in write mode
+	 * @param renderSetting {@link RENDER_ENUM} passed through to the renderer
+	 */
+	private void flushStringModeCharBuf(CharBuffer chars, RENDER_ENUM renderSetting) {
+		chars.flip(); // switch from writing to reading
+		renderChars(chars, renderSetting);
+		chars.clear(); // ready for the next decode call
+	}
+
+	/**
+	 * Appends each code point of the supplied characters to the result, choosing between
+	 * a literal character and an escape sequence.
+	 * <p>
+	 * Order of precedence: null, characters with a dedicated escape sequence (control
+	 * characters and backslash), displayable characters, characters that are always
+	 * escaped (ISO control, undefined, BOM), and finally everything else, which is
+	 * rendered literally only when the render setting is {@link RENDER_ENUM#ALL}.
+	 *
+	 * @param stringValue decoded characters to render
+	 * @param renderSetting {@link RENDER_ENUM} controlling how remaining characters are shown
+	 */
+	private void renderChars(CharSequence stringValue, RENDER_ENUM renderSetting) {
+		for (int i = 0, strLength = stringValue.length(); i < strLength;) {
+			int codePoint = Character.codePointAt(stringValue, i);
+
+			if (codePoint == 0) {
+				// TODO: there is an opportunity to make this smarter by not switching from
+				// byte mode to string mode to add nulls.
+				addString("\\0");
+			}
+			else if (StringUtilities.isControlCharacterOrBackslash(codePoint)) {
+				// Must be tested before isDisplayable(): a backslash is presumably also
+				// "displayable", and emitting it unescaped would make it indistinguishable
+				// from the escape sequences produced here.
+				addString(StringUtilities.convertCodePointToEscapeSequence(codePoint));
+			}
+			else if (StringUtilities.isDisplayable(codePoint)) {
+				addCodePointChar(codePoint);
+			}
+			else if (Character.isISOControl(codePoint) || !Character.isDefined(codePoint) ||
+				codePoint == StringUtilities.UNICODE_BE_BYTE_ORDER_MARK) {
+				// never shown literally, regardless of the render setting
+				addEscapedCodePoint(codePoint);
+			}
+			else if (renderSetting == RENDER_ENUM.ALL) {
+				addCodePointChar(codePoint);
+			}
+			else {
+				addEscapedCodePoint(codePoint);
+			}
+
+			i += Character.charCount(codePoint);
+		}
+	}

 	private void ensureTextMode() {
@ -186,4 +207,34 @@ public class StringRenderBuilder {
 		byteMode = true;
 	}

+	/**
+	 * Returns the formatted string built so far.
+	 * <p>
+	 * If nothing was added, the result is an empty pair of quote characters.  When the
+	 * builder was created for a UTF charset and the result starts with a quoted section,
+	 * a prefix is added based on the character size: "u8" (1), "u" (2) or "U" (4).
+	 *
+	 * @return formatted string, possibly with a prefix
+	 */
+	public String build() {
+		// TODO: change the string prefix modifier to align with what decompiler does
+		String rendered;
+		if (sb.isEmpty()) {
+			rendered = "%c%c".formatted(quoteChar, quoteChar); // '' won't make sense
+		}
+		else {
+			rendered = toString();
+		}
+
+		boolean startsQuoted = !rendered.isEmpty() && rendered.charAt(0) == quoteChar;
+		if (!utfCharset || !startsQuoted) {
+			return rendered;
+		}
+		return switch (charSize) {
+			case 1 -> "u8" + rendered;
+			case 2 -> "u" + rendered;
+			case 4 -> "U" + rendered;
+			default -> rendered;
+		};
+	}
+
+	/**
+	 * Returns the raw formatted string, closing a still-open quoted text section.
+	 * <p>
+	 * Example (quotes are part of result): {@code "Test\tstring",01h,02h,"Second\npart"}
+	 * <p>
+	 * @return Formatted string
+	 */
+	@Override
+	public String toString() {
+		String rendered = sb.toString();
+		// when still in text mode, close the quoted section in the returned copy only
+		return byteMode ? rendered : rendered + quoteChar;
+	}
+
 }
--- a/Ghidra/Framework/SoftwareModeling/src/test/java/ghidra/program/model/data/StringRenderBuilderTest.java
+++ b/Ghidra/Framework/SoftwareModeling/src/test/java/ghidra/program/model/data/StringRenderBuilderTest.java
@ -0,0 +1,229 @@
+/* ###
+ * IP: GHIDRA
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package ghidra.program.model.data;
+
+import static org.junit.Assert.*;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.Test;
+
+import generic.test.AbstractGTest;
+import ghidra.program.model.data.RenderUnicodeSettingsDefinition.RENDER_ENUM;
+
+public class StringRenderBuilderTest extends AbstractGTest { // tests StringRenderBuilder.decodeBytesUsingCharset() and build()
+
+	private ByteBuffer bb(int... values) { // wraps the given byte values in a ByteBuffer
+		return ByteBuffer.wrap(bytes(values)); // bytes() is not defined here; presumably inherited from AbstractGTest
+	}
+
+	@Test
+	public void testEmptyString() { // nothing added to the builder
+		StringRenderBuilder srb =
+			new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		String emptyString = srb.build();
+		assertEquals("\"\"", emptyString); // just a pair of quotes
+	}
+
+	@Test
+	public void testEmptyWChar2String() {
+		StringRenderBuilder srb =
+			new StringRenderBuilder(true, 2, StringRenderBuilder.DOUBLE_QUOTE);
+		String emptyString = srb.build();
+		assertEquals("u\"\"", emptyString); // utf charset + charSize 2 -> "u" prefix
+	}
+
+	@Test
+	public void testEmptyWChar4String() {
+		StringRenderBuilder srb =
+			new StringRenderBuilder(true, 4, StringRenderBuilder.DOUBLE_QUOTE);
+		String emptyString = srb.build();
+		assertEquals("U\"\"", emptyString); // utf charset + charSize 4 -> "U" prefix
+	}
+
+	@Test
+	public void testEmptyStringWithNulls() {
+		ByteBuffer bb = bb(0, 0, 0);
+		StringRenderBuilder srb =
+			new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true); // last arg: trimTrailingNulls
+		String emptyString = srb.build();
+		assertEquals("\"\"", emptyString); // all nulls were trimmed
+	}
+
+	@Test
+	public void testEmptyStringWithNullsNoTrim() {
+		ByteBuffer bb = bb(0, 0, 0);
+		StringRenderBuilder srb =
+			new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, false);
+		String s = srb.build();
+		assertEquals("\"\\0\\0\\0\"", s); // without trimming each null is rendered as \0
+	}
+
+	@Test
+	public void testInteriorNulls() {
+		ByteBuffer bb = bb('t', 'e', 0, 's', 't', 0);
+		StringRenderBuilder srb =
+			new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
+		String s = srb.build();
+		assertEquals("\"te\\0st\"", s); // interior null is kept, trailing null is trimmed
+	}
+
+	@Test
+	public void testSimpleString() {
+		ByteBuffer bb = bb('t', 'e', 's', 't');
+		StringRenderBuilder srb =
+			new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
+		String s = srb.build();
+		assertEquals("\"test\"", s);
+	}
+
+	@Test
+	public void testStandardEscapedChars() {
+		ByteBuffer bb = bb('t', 'e', 's', 't', '\n', '\t', '\r');
+		StringRenderBuilder srb =
+			new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
+		String s = srb.build();
+		assertEquals("\"test\\n\\t\\r\"", s); // control chars are rendered as escape sequences
+	}
+
+	@Test
+	public void testQuotedQuotesChars() {
+		ByteBuffer bb = bb('t', 'e', 's', 't', '"', '1', '2', '3');
+		StringRenderBuilder srb =
+			new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
+		String s = srb.build();
+		assertEquals("\"test\\\"123\"", s); // the quote char itself is backslash-escaped
+	}
+
+	@Test
+	public void testSingleQuoteChars() {
+		ByteBuffer bb = bb('t', 'e', 's', 't', '\'', '1', '2', '3');
+		StringRenderBuilder srb =
+			new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
+		String s = srb.build();
+		assertEquals("\"test'123\"", s); // a single quote is not escaped when quoting with double quotes
+	}
+
+	@Test
+	public void testSimpleStringWithTrailingNulls() {
+		ByteBuffer bb = bb('t', 'e', 's', 't', 0, 0, 0);
+		StringRenderBuilder srb =
+			new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
+		String s = srb.build();
+		assertEquals("\"test\"", s);
+	}
+
+	@Test
+	public void testSimpleStringWithTrailingNullsNoTrim() {
+		ByteBuffer bb = bb('t', 'e', 's', 't', 0, 0, 0);
+		StringRenderBuilder srb =
+			new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, false);
+		String s = srb.build();
+		assertEquals("\"test\\0\\0\\0\"", s);
+	}
+
+	@Test
+	public void testUtf8String() {
+		ByteBuffer bb = bb(0xE1, 0x84, 0xA2); // should decode to \u1122
+		StringRenderBuilder srb =
+			new StringRenderBuilder(true, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.UTF_8, RENDER_ENUM.ALL, true);
+		String s = srb.build();
+		assertEquals("u8\"\u1122\"", s);
+	}
+
+	@Test
+	public void testUtf8NoRenderNonLatinString() {
+		ByteBuffer bb = bb(0xE1, 0x84, 0xA2); // should decode to \u1122
+		StringRenderBuilder srb =
+			new StringRenderBuilder(true, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.UTF_8, RENDER_ENUM.ESC_SEQ, true);
+		String s = srb.build();
+		assertEquals("u8\"\\u1122\"", s); // <- result is \ u 1122
+	}
+
+	@Test
+	public void testBadBytes_USASCII() {
+		ByteBuffer bb = bb('t', 'e', 's', 't', 0x80);
+		StringRenderBuilder srb =
+			new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
+		String s = srb.build();
+		assertEquals("\"test\",80h", s); // undecodable byte is rendered as a hex value outside the quotes
+	}
+
+	@Test
+	public void testBadBytes_USASCII2() {
+		// bad bytes in interior of string, switching modes
+		ByteBuffer bb = bb('t', 'e', 0x80, 's', 't');
+		StringRenderBuilder srb =
+			new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
+		String s = srb.build();
+		assertEquals("\"te\",80h,\"st\"", s);
+	}
+
+	@Test
+	public void testBadBytes_USASCII3() {
+		// bad bytes at beginning of string
+		ByteBuffer bb = bb(0x80, 't', 'e', 's', 't');
+		StringRenderBuilder srb =
+			new StringRenderBuilder(false, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.US_ASCII, RENDER_ENUM.ALL, true);
+		String s = srb.build();
+		assertEquals("80h,\"test\"", s);
+	}
+
+	@Test
+	public void testTruncatedUtf8() {
+		ByteBuffer bb = bb('t', 'e', 's', 't', 0xE1, 0x84);
+		StringRenderBuilder srb =
+			new StringRenderBuilder(true, 1, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.UTF_8, RENDER_ENUM.ALL, true);
+		String s = srb.build();
+		assertEquals("u8\"test\",E1h,84h", s); // incomplete multi-byte sequence is rendered as hex byte values
+	}
+
+	@Test
+	public void testUtf16() {
+		ByteBuffer bb = bb('t', 0, 'e', 0, 's', 0, 't', 0);
+		StringRenderBuilder srb =
+			new StringRenderBuilder(true, 2, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.UTF_16LE, RENDER_ENUM.ALL, true);
+		String s = srb.build();
+		assertEquals("u\"test\"", s);
+	}
+
+	@Test
+	public void testUtf16BOM_LE() {
+		ByteBuffer bb = bb(0xff, 0xfe, 't', 0, 'e', 0, 's', 0, 't', 0);
+		StringRenderBuilder srb =
+			new StringRenderBuilder(true, 2, StringRenderBuilder.DOUBLE_QUOTE);
+		srb.decodeBytesUsingCharset(bb, StandardCharsets.UTF_16LE, RENDER_ENUM.ALL, true);
+		String s = srb.build();
+		assertEquals("u\"\\uFEFFtest\"", s); // the leading BOM is expected to show up as an escape sequence
+	}
+}