Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WebCmdlets parse XML declaration to get encoding value, if present. #18748

Merged
merged 10 commits into from
Jan 21, 2023
Prev Previous commit
Next Next commit
add regex to detect encoding from xml declaration
  • Loading branch information
CarloToso committed Dec 8, 2022
commit a81c1c8c4d126d2bcc5ef625aae33021534f8b04
Original file line number Diff line number Diff line change
Expand Up @@ -418,15 +418,20 @@ internal static bool TryGetEncoding(string characterSet, out Encoding encoding)
return result;
}

// Regex used by DecodeStream to find a charset declared on an HTML <meta ...> tag.
// The declared value is exposed through the named group "charset".
//   - `[""'\n]?` allows an optional opening quote (or newline) before the value.
//   - The captured value must start with a letter, followed by one arbitrary
//     character, then any run of characters that are not whitespace, quotes or
//     angle brackets - so the capture is always at least two characters long.
//   - The value must be terminated by whitespace, a quote or '>'.
// Options: Singleline lets '.' match newlines, IgnoreCase makes the tag and attribute
// names case-insensitive, ExplicitCapture restricts captures to the named group.
private static readonly Regex s_metaexp = new(
private static readonly Regex s_metaRegex = new(
@"<meta\s.*[^.><]*charset\s*=\s*[""'\n]?(?<charset>[A-Za-z].[^\s""'\n<>]*)[\s""'\n>]",
RegexOptions.Compiled | RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase
);

// Regex used by DecodeStream to find the encoding named in an XML declaration
// (text beginning with "<?xml" followed by whitespace). The declared value is exposed
// through the named group "encoding".
//   - `[""'\n]?` allows an optional opening quote (or newline) before the value.
//   - The captured value must start with a letter, followed by one arbitrary
//     character, then any run of characters that are not whitespace, quotes or
//     angle brackets - so the capture is always at least two characters long.
//   - The value must be terminated by whitespace, a quote or '>'.
// Options: Singleline lets '.' match newlines, IgnoreCase makes "xml"/"encoding"
// case-insensitive, ExplicitCapture restricts captures to the named group.
// NOTE(review): `.*` is greedy and, with Singleline, spans the whole searched text, so
// if "encoding=" appears more than once the last usable occurrence appears to win,
// even if it lies outside the declaration itself - confirm this is acceptable.
private static readonly Regex s_xmlRegex = new(
@"<\?xml\s.*[^.><]*encoding\s*=\s*[""'\n]?(?<encoding>[A-Za-z].[^\s""'\n<>]*)[\s""'\n>]",
RegexOptions.Compiled | RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase
);
iSazonov marked this conversation as resolved.
Show resolved Hide resolved

internal static string DecodeStream(Stream stream, ref Encoding encoding)
{
bool isDefaultEncoding = false;
if (encoding == null)
if (encoding is null)
{
// Use the default encoding if one wasn't provided
encoding = ContentHelper.GetDefaultEncoding();
Expand All @@ -441,11 +446,12 @@ internal static string DecodeStream(Stream stream, ref Encoding encoding)
// check for a charset attribute on the meta element to override the default
// we only look within the first 1k characters as the meta tag is in the head
// tag which is at the start of the document
Match match = s_metaexp.Match(content.Substring(0, Math.Min(content.Length, 1024)));
if (match.Success)
Match match = s_metaRegex.Match(content.Substring(0, Math.Min(content.Length, 1024)));
Match match2 = s_xmlRegex.Match(content.Substring(0, Math.Min(content.Length, 256)));
if (match.Success || match2.Success)
{
Encoding localEncoding = null;
string characterSet = match.Groups["charset"].Value;
string characterSet = (string.IsNullOrEmpty(match.Groups["charset"].Value)) ? match2.Groups["encoding"].Value : match.Groups["charset"].Value;

if (TryGetEncoding(characterSet, out localEncoding))
{
Expand Down