WebClient internally uses a WebRequest to do the downloading, and it will
search WebRequest.ContentType for a "charset" header to use as the encoding.
If the ContentType/charset header doesn't exist or contains an invalid
charset, WebClient.Encoding is used (which is Encoding.Default by default,
or you can assign it beforehand). However, you should be aware that
WebClient.Encoding is only used as a fallback: if the response contains a
valid encoding, it's always used to decode the returned data.
I'm pretty sure it isn't so. If I set Encoding to (for example) UTF32, the
WebClient throws an exception. And if I have a page with a UTF8 character
(a page that in the WebRequest IS correctly shown as a UTF8 page) and I don't
set the Encoding, I receive a wrong String.
--- bye
Try this code. It attempts to get the CharacterSet in various ways and falls back to UTF-8. Checking for ContentEncoding may not be necessary, as I have yet to see it specified. The code is a bit of cut and paste, and you may have to tweak it to get it running.
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it to a string.
/// The character set is taken, in order, from the response's CharacterSet,
/// a "charset=" token in its ContentType, and finally a "charset=" declaration
/// found in the downloaded bytes themselves. UTF-8 is used when none of these
/// yields a usable encoding name.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The response body decoded with the detected (or fallback) encoding.</returns>
/// <exception cref="IOException">ReadStream reported a failure by returning null.</exception>
public string DownloadPage(string url)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
    {
        // NOTE(review): the original assigned to an undeclared "buffer"; it is a
        // local here. If it was meant to be a field read elsewhere, confirm.
        byte[] buffer;
        using (Stream s = resp.GetResponseStream())
        {
            buffer = ReadStream(s);
        }

        // ReadStream returns null when reading fails; surface that directly
        // instead of letting the decoder throw an unrelated ArgumentNullException.
        if (buffer == null)
        {
            throw new IOException("Failed to read the response body from " + url);
        }

        // resp.ContentEncoding is deliberately NOT consulted: it carries the
        // Content-Encoding header (compression such as "gzip"), not a charset.
        string pageEncoding = "";
        if (!string.IsNullOrEmpty(resp.CharacterSet))
        {
            pageEncoding = resp.CharacterSet;
        }
        else if (!string.IsNullOrEmpty(resp.ContentType))
        {
            pageEncoding = GetCharacterSet(resp.ContentType);
        }

        // Headers gave nothing: look for a charset declaration inside the body.
        if (string.IsNullOrEmpty(pageEncoding))
        {
            pageEncoding = GetCharacterSet(buffer);
        }

        Encoding e = Encoding.UTF8;
        if (!string.IsNullOrEmpty(pageEncoding))
        {
            try
            {
                e = Encoding.GetEncoding(pageEncoding);
            }
            catch (System.ArgumentException)
            {
                // Unknown or unsupported name: report it and keep the UTF-8 fallback.
                MessageBox.Show("Invalid encoding: " + pageEncoding);
            }
        }

        string data = e.GetString(buffer);
        Status = "";
        return data;
    }
}
/// <summary>
/// Extracts the value of the last "charset=..." declaration found in
/// <paramref name="s"/> (for example a Content-Type header value or HTML markup).
/// </summary>
/// <param name="s">Text to search; may be null or empty.</param>
/// <returns>
/// The charset name in upper case, or an empty string when
/// <paramref name="s"/> is null/empty or holds no charset declaration.
/// </returns>
private string GetCharacterSet(string s)
{
    if (string.IsNullOrEmpty(s))
    {
        return "";
    }

    // Invariant upper-casing: culture-sensitive casing can alter the name
    // itself (e.g. 'i' under a Turkish culture), making it unrecognisable.
    s = s.ToUpperInvariant();

    int start = s.LastIndexOf("CHARSET", System.StringComparison.Ordinal);
    if (start == -1)
    {
        return "";
    }

    start = s.IndexOf('=', start);
    if (start == -1)
    {
        return "";
    }

    // Skip whitespace and an optional opening quote so that quoted values
    // (charset="utf-8", charset='utf-8') are not cut off at index 0.
    s = s.Substring(start + 1).TrimStart().TrimStart('"', '\'').TrimStart();

    // The value ends at the first delimiter: parameter separator, closing
    // quote, tag end ('/' or '>') or whitespace.
    int end = s.IndexOfAny(new char[] { ';', '"', '\'', '/', '>', ' ', '\t', '\r', '\n' });
    if (end == -1)
    {
        end = s.Length;
    }

    return s.Substring(0, end).Trim();
}
/// <summary>
/// Looks for a charset declaration inside raw downloaded bytes by decoding
/// them with Encoding.Default and delegating to the string overload.
/// </summary>
/// <param name="data">Raw bytes to inspect.</param>
/// <returns>Whatever the string overload returns for the decoded text.</returns>
private string GetCharacterSet(byte[] data)
{
    // The real encoding is not known yet, so the bytes are decoded with the
    // system default purely to make the declaration searchable.
    string decoded = Encoding.Default.GetString(data);
    string charset = GetCharacterSet(decoded);
    return charset;
}
/// <summary>
/// Reads <paramref name="s"/> to the end and returns everything that was read.
/// CurLength is set to the number of bytes collected after each chunk and
/// reset to 0 once the end of the stream is reached.
/// </summary>
/// <param name="s">Stream to drain.</param>
/// <returns>
/// The bytes read, or null if any exception is thrown while reading.
/// </returns>
private byte[] ReadStream(Stream s)
{
    try
    {
        byte[] chunk = new byte[8096];
        using (MemoryStream collected = new MemoryStream())
        {
            int count;
            // Read returns 0 (or less) at end of stream, which ends the loop.
            while ((count = s.Read(chunk, 0, chunk.Length)) > 0)
            {
                collected.Write(chunk, 0, count);
                CurLength = collected.Length;
            }

            CurLength = 0;
            return collected.ToArray();
        }
    }
    catch (Exception)
    {
        // Any failure is reported to the caller as null.
        return null;
    }
}
--
Happy coding!
Morten Wennevik [C# MVP]