Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WebCmdlets parse XML declaration to get encoding value, if present. #18748

Merged
merged 10 commits into from
Jan 21, 2023
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ internal class WebResponseContentMemoryStream : MemoryStream
private bool _isInitialized = false;
private readonly Cmdlet _ownerCmdlet;

#endregion
#endregion Data

#region Constructors
/// <summary>
Expand All @@ -45,47 +45,23 @@ internal WebResponseContentMemoryStream(Stream stream, int initialCapacity, Cmdl
_originalStreamToProxy = stream;
_ownerCmdlet = cmdlet;
}
#endregion
#endregion Constructors

/// <summary>
/// </summary>
public override bool CanRead
{
get
{
return true;
}
}
public override bool CanRead => true;

/// <summary>
/// </summary>
public override bool CanSeek
{
get
{
return true;
}
}
public override bool CanSeek => true;

/// <summary>
/// </summary>
public override bool CanTimeout
{
get
{
return base.CanTimeout;
}
}
public override bool CanTimeout => base.CanTimeout;

/// <summary>
/// </summary>
public override bool CanWrite
{
get
{
return true;
}
}
public override bool CanWrite => true;

/// <summary>
/// </summary>
Expand Down Expand Up @@ -442,15 +418,20 @@ internal static bool TryGetEncoding(string characterSet, out Encoding encoding)
return result;
}

private static readonly Regex s_metaexp = new(
private static readonly Regex s_metaRegex = new(
@"<meta\s.*[^.><]*charset\s*=\s*[""'\n]?(?<charset>[A-Za-z].[^\s""'\n<>]*)[\s""'\n>]",
RegexOptions.Compiled | RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase
);

// Matches an XML declaration ("<?xml ... encoding=...") and captures the declared
// encoding name in the named group "encoding". The value may be wrapped in single or
// double quotes; the captured name must start with an ASCII letter and ends at the
// first whitespace, quote, '<' or '>' character. Matching is case-insensitive and,
// because of Singleline, '.' also matches newlines. DecodeStream applies this regex
// to the start of the decoded content to override the default encoding.
private static readonly Regex s_xmlRegex = new(
@"<\?xml\s.*[^.><]*encoding\s*=\s*[""'\n]?(?<encoding>[A-Za-z].[^\s""'\n<>]*)[\s""'\n>]",
RegexOptions.Compiled | RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase
);
iSazonov marked this conversation as resolved.
Show resolved Hide resolved

internal static string DecodeStream(Stream stream, ref Encoding encoding)
{
bool isDefaultEncoding = false;
if (encoding == null)
if (encoding is null)
{
// Use the default encoding if one wasn't provided
encoding = ContentHelper.GetDefaultEncoding();
Expand All @@ -460,26 +441,29 @@ internal static string DecodeStream(Stream stream, ref Encoding encoding)
string content = StreamToString(stream, encoding);
if (isDefaultEncoding)
{
do
// Check for a charset attribute on the meta element to override the default
// we only look within the first 1k characters as the meta tag is in the head
// tag which is at the start of the document
Match match = s_metaRegex.Match(content.Substring(0, Math.Min(content.Length, 1024)));

// Check for an encoding attribute on the XML declaration to override the default
// we only look within the first 256 characters as the declaration is in the first
// line of the document
Match match2 = s_xmlRegex.Match(content.Substring(0, Math.Min(content.Length, 256)));
iSazonov marked this conversation as resolved.
Show resolved Hide resolved

if (match.Success || match2.Success)
{
// check for a charset attribute on the meta element to override the default
// we only look within the first 1k characters as the meta tag is in the head
// tag which is at the start of the document
Match match = s_metaexp.Match(content.Substring(0, Math.Min(content.Length, 1024)));
if (match.Success)
{
Encoding localEncoding = null;
string characterSet = match.Groups["charset"].Value;
Encoding localEncoding = null;
string characterSet = string.IsNullOrEmpty(match.Groups["charset"].Value) ? match2.Groups["encoding"].Value : match.Groups["charset"].Value;

if (TryGetEncoding(characterSet, out localEncoding))
{
stream.Seek(0, SeekOrigin.Begin);
content = StreamToString(stream, localEncoding);
// report the encoding used.
encoding = localEncoding;
}
if (TryGetEncoding(characterSet, out localEncoding))
{
stream.Seek(0, SeekOrigin.Begin);
content = StreamToString(stream, localEncoding);
// report the encoding used.
iSazonov marked this conversation as resolved.
Show resolved Hide resolved
encoding = localEncoding;
}
} while (false);
}
}

return content;
Expand Down