1 | <?xml version="1.0"?> |
---|
2 | <!DOCTYPE rfc SYSTEM "rfc2629.dtd" [ |
---|
3 | <!ENTITY rfc1738 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.1738.xml"> |
---|
4 | <!ENTITY rfc2045 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2045.xml"> |
---|
5 | <!ENTITY rfc2119 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml"> |
---|
6 | <!ENTITY rfc2130 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2130.xml"> |
---|
7 | <!ENTITY rfc2141 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2141.xml"> |
---|
8 | <!ENTITY rfc2192 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2192.xml"> |
---|
9 | <!ENTITY rfc2277 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2277.xml"> |
---|
10 | <!ENTITY rfc2368 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2368.xml"> |
---|
11 | <!ENTITY rfc2384 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2384.xml"> |
---|
12 | <!ENTITY rfc2396 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2396.xml"> |
---|
13 | <!ENTITY rfc2397 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2397.xml"> |
---|
14 | <!ENTITY rfc2616 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2616.xml"> |
---|
15 | <!ENTITY rfc2640 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2640.xml"> |
---|
16 | <!ENTITY rfc3490 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3490.xml"> |
---|
17 | <!ENTITY rfc3491 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3491.xml"> |
---|
18 | <!ENTITY rfc3629 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3629.xml"> |
---|
19 | <!ENTITY rfc3986 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3986.xml"> |
---|
20 | <!ENTITY rfc3987 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3987.xml"> |
---|
21 | <!ENTITY rfc5890 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5890.xml"> |
---|
22 | <!ENTITY rfc5891 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5891.xml"> |
---|
23 | ]> |
---|
24 | <?rfc strict='yes'?> |
---|
25 | |
---|
26 | <?xml-stylesheet type='text/css' href='rfc2629.css' ?> |
---|
27 | <?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?> |
---|
28 | <?rfc symrefs='yes'?> |
---|
29 | <?rfc sortrefs='yes'?> |
---|
30 | <?rfc iprnotified="no" ?> |
---|
31 | <?rfc toc='yes'?> |
---|
32 | <?rfc compact='yes'?> |
---|
33 | <?rfc subcompact='no'?> |
---|
34 | <rfc ipr="pre5378Trust200902" docName="draft-ietf-iri-3987bis-04" category="std" xml:lang="en" obsoletes="3987"> |
---|
35 | <front> |
---|
36 | <title abbrev="IRIs">Internationalized Resource Identifiers (IRIs)</title> |
---|
37 | |
---|
38 | <author initials="M.J." surname="Duerst" fullname='Martin Duerst'> |
---|
39 | <!-- (Note: Please write "Duerst" with u-umlaut wherever |
---|
40 | possible, for example as "Dürst" in XML and HTML) --> |
---|
41 | <organization abbrev="Aoyama Gakuin University">Aoyama Gakuin University</organization> |
---|
42 | <address> |
---|
43 | <postal> |
---|
44 | <street>5-10-1 Fuchinobe</street> |
---|
45 | <city>Sagamihara</city> |
---|
46 | <region>Kanagawa</region> |
---|
47 | <code>229-8558</code> |
---|
48 | <country>Japan</country> |
---|
49 | </postal> |
---|
50 | <phone>+81 42 759 6329</phone> |
---|
51 | <facsimile>+81 42 759 6495</facsimile> |
---|
52 | <email>duerst@it.aoyama.ac.jp</email> |
---|
53 | <uri>http://www.sw.it.aoyama.ac.jp/D%C3%BCrst/<!-- (Note: This is the percent-encoded form of an IRI)--></uri> |
---|
54 | </address> |
---|
55 | </author> |
---|
56 | |
---|
57 | <author initials="M.L." surname="Suignard" fullname="Michel Suignard"> |
---|
58 | <organization>Unicode Consortium</organization> |
---|
59 | <address> |
---|
60 | <postal> |
---|
61 | <street></street> |
---|
62 | <street>P.O. Box 391476</street> |
---|
63 | <city>Mountain View</city> |
---|
64 | <region>CA</region> |
---|
65 | <code>94039-1476</code> |
---|
66 | <country>U.S.A.</country> |
---|
67 | </postal> |
---|
68 | <phone>+1-650-693-3921</phone> |
---|
69 | <email>michel@unicode.org</email> |
---|
70 | <uri>http://www.suignard.com</uri> |
---|
71 | </address> |
---|
72 | </author> |
---|
73 | <author initials="L." surname="Masinter" fullname="Larry Masinter"> |
---|
74 | <organization>Adobe</organization> |
---|
75 | <address> |
---|
76 | <postal> |
---|
77 | <street>345 Park Ave</street> |
---|
78 | <city>San Jose</city> |
---|
79 | <region>CA</region> |
---|
80 | <code>95110</code> |
---|
81 | <country>U.S.A.</country> |
---|
82 | </postal> |
---|
83 | <phone>+1-408-536-3024</phone> |
---|
84 | <email>masinter@adobe.com</email> |
---|
85 | <uri>http://larry.masinter.net</uri> |
---|
86 | </address> |
---|
87 | </author> |
---|
88 | |
---|
89 | <date year="2011" month="March" day="14"/> |
---|
90 | <area>Applications</area> |
---|
91 | <workgroup>Internationalized Resource Identifiers (iri)</workgroup> |
---|
92 | <keyword>IRI</keyword> |
---|
93 | <keyword>Internationalized Resource Identifier</keyword> |
---|
94 | <keyword>UTF-8</keyword> |
---|
95 | <keyword>URI</keyword> |
---|
96 | <keyword>URL</keyword> |
---|
97 | <keyword>IDN</keyword> |
---|
98 | <keyword>LEIRI</keyword> |
---|
99 | |
---|
100 | <abstract> |
---|
101 | <t>This document defines the Internationalized Resource Identifier |
---|
102 | (IRI) protocol element, as an extension of the Uniform Resource |
---|
103 | Identifier (URI). An IRI is a sequence of characters from the |
---|
104 | Universal Character Set (Unicode/ISO 10646). Grammar and processing |
---|
105 | rules are given for IRIs and related syntactic forms.</t> |
---|
106 | |
---|
107 | <t>In addition, this document provides named additional rule sets |
---|
108 | for processing otherwise invalid IRIs, in a way that supports |
---|
109 | other specifications that wish to mandate common behavior for |
---|
110 | 'error' handling. In particular, rules used in some XML languages |
---|
111 | (LEIRI) and web applications are given.</t> |
---|
112 | |
---|
113 | <t>Defining IRI as a new protocol element (rather than updating or |
---|
114 | extending the definition of URI) allows independent orderly |
---|
115 | transitions: other protocols and languages that use URIs must |
---|
116 | explicitly choose to allow IRIs.</t> |
---|
117 | |
---|
118 | <t>Guidelines are provided for the use and deployment of IRIs and |
---|
119 | related protocol elements when revising protocols, formats, and |
---|
120 | software components that currently deal only with URIs.</t> |
---|
121 | |
---|
122 | </abstract> |
---|
123 | <note title='RFC Editor: Please remove the next paragraph before publication.'> |
---|
124 | <t>This document is intended to update RFC 3987 and move towards IETF |
---|
125 | Draft Standard. For discussion and comments on this |
---|
126 | draft, please join the IETF IRI WG by subscribing to the mailing |
---|
127 | list public-iri@w3.org. For a list of open issues, please see |
---|
128 | the issue tracker of the WG at http://trac.tools.ietf.org/wg/iri/trac/report/1. |
---|
129 | For a list of individual edits, please see the change history at |
---|
130 | http://trac.tools.ietf.org/wg/iri/trac/log/draft-ietf-iri-3987bis.</t> |
---|
131 | </note> |
---|
132 | </front> |
---|
133 | <middle> |
---|
134 | |
---|
135 | <section title="Introduction"> |
---|
136 | |
---|
137 | <section title="Overview and Motivation" anchor="overview"> |
---|
138 | |
---|
139 | <t>A Uniform Resource Identifier (URI) is defined in <xref |
---|
140 | target="RFC3986"/> as a sequence of characters chosen from a limited |
---|
141 | subset of the repertoire of US-ASCII <xref target="ASCII"/> |
---|
142 | characters.</t> |
---|
143 | |
---|
144 | <t>The characters in URIs are frequently used for representing words |
---|
145 | of natural languages. This usage has many advantages: Such URIs are |
---|
146 | easier to memorize, easier to interpret, easier to transcribe, easier |
---|
147 | to create, and easier to guess. For most languages other than English, |
---|
148 | however, the natural script uses characters other than A - Z. For many |
---|
149 | people, handling Latin characters is as difficult as handling the |
---|
150 | characters of other scripts is for those who use only the Latin |
---|
151 | alphabet. Many languages with non-Latin scripts are transcribed with |
---|
152 | Latin letters. These transcriptions are now often used in URIs, but |
---|
153 | they introduce additional difficulties.</t> |
---|
154 | |
---|
155 | <t>The infrastructure for the appropriate handling of characters from |
---|
156 | additional scripts is now widely deployed in operating system and |
---|
157 | application software. Software that can handle a wide variety of |
---|
158 | scripts and languages at the same time is increasingly common. Also, |
---|
159 | an increasing number of protocols and formats can carry a wide range of |
---|
160 | characters.</t> |
---|
161 | |
---|
162 | <t>URIs are used both as a protocol element (for transmission and |
---|
163 | processing by software) and also a presentation element (for display |
---|
164 | and handling by people who read, interpret, coin, or guess them). The |
---|
165 | transition between these roles is more difficult and complex when |
---|
166 | dealing with a larger set of characters than allowed for URIs in |
---|
167 | <xref target="RFC3986"/>. </t> |
---|
168 | |
---|
169 | <t>This document defines the protocol element called Internationalized |
---|
170 | Resource Identifier (IRI), which allows applications of URIs to be |
---|
171 | extended to use resource identifiers that have a much wider repertoire |
---|
172 | of characters. It also provides corresponding "internationalized" |
---|
173 | versions of other constructs from <xref target="RFC3986"/>, such as |
---|
174 | URI references. The syntax of IRIs is defined in <xref |
---|
175 | target="syntax"/>. |
---|
176 | </t> |
---|
177 | |
---|
178 | <t>Using characters outside of A - Z in IRIs adds a number of |
---|
179 | difficulties. <xref target="Bidi"/> discusses the special case of |
---|
180 | bidirectional IRIs using characters from scripts written |
---|
181 | right-to-left. <xref target="equivalence"/> discusses various forms |
---|
182 | of equivalence between IRIs. <xref target="IRIuse"/> discusses the use |
---|
183 | of IRIs in different situations. <xref target="guidelines"/> gives |
---|
184 | additional informative guidelines. <xref target="security"/> |
---|
185 | discusses IRI-specific security considerations.</t> |
---|
186 | |
---|
187 | <t>When originally defining IRIs, several design alternatives were considered. |
---|
188 | Historically interested readers can find an overview in Appendix A of <xref target="RFC3987"/>. |
---|
189 | For some additional background on the design of URIs and IRIs, please also see |
---|
190 | <xref target="Gettys"/>.</t> |
---|
191 | </section> <!-- overview --> |
---|
192 | |
---|
193 | <section title="Applicability" anchor="Applicability"> |
---|
194 | |
---|
195 | <t>IRIs are designed to allow protocols and software that deal with |
---|
196 | URIs to be updated to handle IRIs. A "URI scheme" (as defined by <xref |
---|
197 | target="RFC3986"/> and registered through the IANA process defined in |
---|
198 | <xref target="RFC4395bis"/>) also serves as an "IRI scheme". Processing of |
---|
199 | IRIs is accomplished by extending the URI syntax while retaining (and |
---|
200 | not expanding) the set of "reserved" characters, such that the syntax |
---|
201 | for any URI scheme may be uniformly extended to allow non-ASCII |
---|
202 | characters. In addition, following parsing of an IRI, it is possible |
---|
203 | to construct a corresponding URI by first encoding characters outside |
---|
204 | of the allowed URI range and then reassembling the components. |
---|
205 | </t> |
---|
206 | |
---|
207 | <t>Practical use of IRI forms in place of URI forms depends on the |
---|
208 | following conditions being met:</t> |
---|
209 | |
---|
210 | <t><list style="hanging"> |
---|
211 | |
---|
212 | <t hangText="a.">A protocol or format element MUST be explicitly designated to be |
---|
213 | able to carry IRIs. The intent is to avoid introducing IRIs into |
---|
214 | contexts that are not defined to accept them. For example, XML |
---|
215 | schema <xref target="XMLSchema"/> has an explicit type "anyURI" that |
---|
216 | includes IRIs and IRI references. Therefore, IRIs and IRI references |
---|
217 | can be in attributes and elements of type "anyURI". On the other |
---|
218 | hand, in the <xref target="RFC2616"/> definition of HTTP/1.1, the |
---|
219 | Request URI is defined as a URI, which means that direct use of IRIs |
---|
220 | is not allowed in HTTP requests.</t> |
---|
221 | |
---|
222 | <t hangText="b.">The protocol or format carrying the IRIs MUST have a |
---|
223 | mechanism to represent the wide range of characters used in IRIs, |
---|
224 | either natively or by some protocol- or format-specific escaping |
---|
225 | mechanism (for example, numeric character references in <xref |
---|
226 | target="XML1"/>).</t> |
---|
227 | |
---|
228 | <t hangText="c.">The URI scheme definition, if it explicitly allows a |
---|
229 | percent sign ("%") in any syntactic component, SHOULD define the |
---|
230 | interpretation of sequences of percent-encoded octets (using "%XX" |
---|
231 | hex octets) as octets from sequences of UTF-8 encoded strings; this |
---|
232 | is recommended in the guidelines for registering new schemes, <xref |
---|
233 | target="RFC4395bis"/>. For example, this is the practice for IMAP URLs |
---|
234 | <xref target="RFC2192"/>, POP URLs <xref target="RFC2384"/> and the |
---|
235 | URN syntax <xref target="RFC2141"/>. Note that use of |
---|
236 | percent-encoding may also be restricted in some situations, for |
---|
237 | example, URI schemes that disallow percent-encoding might still be |
---|
238 | used with a fragment identifier which is percent-encoded (e.g., |
---|
239 | <xref target="XPointer"/>). See <xref target="UTF8use"/> for further |
---|
240 | discussion.</t> |
---|
241 | </list></t> |
---|
242 | |
---|
243 | </section> <!-- applicability --> |
---|
244 | |
---|
245 | <section title="Definitions" anchor="sec-Definitions"> |
---|
246 | |
---|
247 | <t>The following definitions are used in this document; they follow the |
---|
248 | terms in <xref target="RFC2130"/>, <xref target="RFC2277"/>, and |
---|
249 | <xref target="ISO10646"/>.</t> |
---|
250 | <t><list style="hanging"> |
---|
251 | |
---|
252 | <t hangText="character:">A member of a set of elements used for the |
---|
253 | organization, control, or representation of data. For example, |
---|
254 | "LATIN CAPITAL LETTER A" names a character.</t> |
---|
255 | |
---|
256 | <t hangText="octet:">An ordered sequence of eight bits considered as a |
---|
257 | unit.</t> |
---|
258 | |
---|
259 | <t hangText="character repertoire:">A set of characters (set in the |
---|
260 | mathematical sense).</t> |
---|
261 | |
---|
262 | <t hangText="sequence of characters:">A sequence of characters (one |
---|
263 | after another).</t> |
---|
264 | |
---|
265 | <t hangText="sequence of octets:">A sequence of octets (one after |
---|
266 | another).</t> |
---|
267 | |
---|
268 | <t hangText="character encoding:">A method of representing a sequence |
---|
269 | of characters as a sequence of octets (maybe with variants). Also, |
---|
270 | a method of (unambiguously) converting a sequence of octets into a |
---|
271 | sequence of characters.</t> |
---|
272 | |
---|
273 | <t hangText="charset:">The name of a parameter or attribute used to |
---|
274 | identify a character encoding.</t> |
---|
275 | |
---|
276 | <t hangText="UCS:">Universal Character Set. The coded character set |
---|
277 | defined by ISO/IEC 10646 <xref target="ISO10646"/> and the Unicode |
---|
278 | Standard <xref target="UNIV6"/>.</t> |
---|
279 | |
---|
280 | <t hangText="IRI reference:">Denotes the common usage of an |
---|
281 | Internationalized Resource Identifier. An IRI reference may be |
---|
282 | absolute or relative. However, the "IRI" that results from such a |
---|
283 | reference only includes absolute IRIs; any relative IRI references |
---|
284 | are resolved to their absolute form. Note that in <xref |
---|
285 | target="RFC2396"/> URIs did not include fragment identifiers, but |
---|
286 | in <xref target="RFC3986"/> fragment identifiers are part of |
---|
287 | URIs.</t> |
---|
288 | |
---|
289 | <t hangText="URL:">The term "URL" was originally used <xref |
---|
290 | target="RFC1738"/> for roughly what is now called a "URI". Books, |
---|
291 | software and documentation often refer to URIs and IRIs using the |
---|
292 | "URL" term. Some usages restrict "URL" to those URIs which are not |
---|
293 | URNs. Because of the ambiguity of the term, using the term "URL" is |
---|
294 | NOT RECOMMENDED in formal documents.</t> |
---|
295 | |
---|
296 | <t hangText="LEIRI (Legacy Extended IRI) processing:"> This term was used in |
---|
297 | various XML specifications to refer |
---|
298 | to strings that, although not valid IRIs, were acceptable input to |
---|
299 | the processing rules in <xref target="LEIRIspec" />.</t> |
---|
300 | |
---|
301 | <t hangText="(Web Address, Hypertext Reference, HREF):"> These terms have been |
---|
302 | added in this document for convenience, to allow other |
---|
303 | specifications to refer to those strings that, although not valid |
---|
304 | IRIs, are acceptable input to the processing rules in <xref |
---|
305 | target="webaddress"/>. This usage corresponds to the parsing rules |
---|
306 | of some popular web browsing applications. |
---|
307 | ISSUE: Need to find a good name/abbreviation for these.</t> |
---|
308 | |
---|
309 | <t hangText="running text:">Human text (paragraphs, sentences, |
---|
310 | phrases) with syntax according to orthographic conventions of a |
---|
311 | natural language, as opposed to syntax defined for ease of |
---|
312 | processing by machines (e.g., markup, programming languages).</t> |
---|
313 | |
---|
314 | <t hangText="protocol element:">Any portion of a message that affects |
---|
315 | processing of that message by the protocol in question.</t> |
---|
316 | |
---|
317 | <t hangText="presentation element:">A presentation form corresponding |
---|
318 | to a protocol element; for example, using a wider range of |
---|
319 | characters.</t> |
---|
320 | |
---|
321 | <t hangText="create (a URI or IRI):">With respect to URIs and IRIs, |
---|
322 | the term is used for the initial creation. This may be the |
---|
323 | initial creation of a resource with a certain identifier, or the |
---|
324 | initial exposition of a resource under a particular |
---|
325 | identifier.</t> |
---|
326 | |
---|
327 | <t hangText="generate (a URI or IRI):">With respect to URIs and IRIs, |
---|
328 | the term is used when the identifier is generated by derivation |
---|
329 | from other information.</t> |
---|
330 | |
---|
331 | <t hangText="parsed URI component:">When a URI processor parses a URI |
---|
332 | (following the generic syntax or a scheme-specific syntax), the result |
---|
333 | is a set of parsed URI components, each of which has a type |
---|
334 | (corresponding to the syntactic definition) and a sequence of URI |
---|
335 | characters. </t> |
---|
336 | |
---|
337 | <t hangText="parsed IRI component:">When an IRI processor parses |
---|
338 | an IRI directly, following the general syntax or a scheme-specific |
---|
339 | syntax, the result is a set of parsed IRI components, each of |
---|
340 | which has a type (corresponding to the syntactic definition) |
---|
341 | and a sequence of IRI characters. (This definition is analogous |
---|
342 | to "parsed URI component".)</t> |
---|
343 | |
---|
344 | <t hangText="IRI scheme:">A URI scheme may also be known as |
---|
345 | an "IRI scheme" if the scheme's syntax has been extended to |
---|
346 | allow non-US-ASCII characters according to the rules in this |
---|
347 | document.</t> |
---|
348 | |
---|
349 | </list></t> |
---|
350 | </section> <!-- definitions --> |
---|
351 | <section title="Notation" anchor="sec-Notation"> |
---|
352 | |
---|
353 | <t>RFCs and Internet Drafts currently do not allow any characters |
---|
354 | outside the US-ASCII repertoire. Therefore, this document uses various |
---|
355 | special notations to denote such characters in examples.</t> |
---|
356 | |
---|
357 | <t>In text, characters outside US-ASCII are sometimes referenced by |
---|
358 | using a prefix of 'U+', followed by four to six hexadecimal |
---|
359 | digits.</t> |
---|
360 | |
---|
361 | <t>To represent characters outside US-ASCII in examples, this document |
---|
362 | uses two notations: 'XML Notation' and 'Bidi Notation'.</t> |
---|
363 | |
---|
364 | <t>XML Notation uses a leading '&#x', a trailing ';', and the |
---|
365 | hexadecimal number of the character in the UCS in between. For |
---|
366 | example, &#x44F; stands for CYRILLIC SMALL LETTER YA. In this |
---|
367 | notation, an actual '&' is denoted by '&amp;'.</t> |
---|
368 | |
---|
369 | <t>Bidi Notation is used for bidirectional examples: Lower case |
---|
370 | letters stand for Latin letters or other letters that are written left |
---|
371 | to right, whereas upper case letters represent Arabic or Hebrew |
---|
372 | letters that are written right to left.</t> |
---|
373 | |
---|
374 | <t>To denote actual octets in examples (as opposed to percent-encoded |
---|
375 | octets), the two hex digits denoting the octet are enclosed in "<" |
---|
376 | and ">". For example, the octet often denoted as 0xc9 is denoted |
---|
377 | here as <c9>.</t> |
---|
378 | |
---|
379 | <t> In this document, the key words "MUST", "MUST NOT", "REQUIRED", |
---|
380 | "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", |
---|
381 | and "OPTIONAL" are to be interpreted as described in <xref |
---|
382 | target="RFC2119"/>.</t> |
---|
383 | |
---|
384 | </section> <!-- notation --> |
---|
385 | </section> <!-- introduction --> |
---|
386 | |
---|
387 | <section title="IRI Syntax" anchor="syntax"> |
---|
388 | <t>This section defines the syntax of Internationalized Resource |
---|
389 | Identifiers (IRIs).</t> |
---|
390 | |
---|
391 | <t>As with URIs, an IRI is defined as a sequence of characters, not as |
---|
392 | a sequence of octets. This definition accommodates the fact that IRIs |
---|
393 | may be written on paper or read over the radio as well as stored or |
---|
394 | transmitted digitally. The same IRI might be represented as different |
---|
395 | sequences of octets in different protocols or documents if these |
---|
396 | protocols or documents use different character encodings (and/or |
---|
397 | transfer encodings). Using the same character encoding as the |
---|
398 | containing protocol or document ensures that the characters in the IRI |
---|
399 | can be handled (e.g., searched, converted, displayed) in the same way |
---|
400 | as the rest of the protocol or document.</t> |
---|
401 | |
---|
402 | <section title="Summary of IRI Syntax" anchor="summary"> |
---|
403 | |
---|
404 | <t>IRIs are defined by extending the URI syntax in <xref |
---|
405 | target="RFC3986"/>, but extending the class of unreserved characters |
---|
406 | by adding the characters of the UCS (Universal Character Set, <xref |
---|
407 | target="ISO10646"/>) beyond U+007F, subject to the limitations given |
---|
408 | in the syntax rules below and in <xref target="limitations"/>.</t> |
---|
409 | |
---|
410 | <t>The syntax and use of components and reserved characters is the |
---|
411 | same as that in <xref target="RFC3986"/>. Each "URI scheme" thus also |
---|
412 | functions as an "IRI scheme", in that scheme-specific parsing rules |
---|
413 | for URIs of a scheme are extended to allow parsing of IRIs using |
---|
414 | the same parsing rules.</t> |
---|
415 | |
---|
416 | <t>All the operations defined in <xref target="RFC3986"/>, such as the |
---|
417 | resolution of relative references, can be applied to IRIs by |
---|
418 | IRI-processing software in exactly the same way as they are for URIs |
---|
419 | by URI-processing software.</t> |
---|
420 | |
---|
421 | <t>Characters outside the US-ASCII repertoire MUST NOT be reserved and |
---|
422 | therefore MUST NOT be used for syntactical purposes, such as to |
---|
423 | delimit components in newly defined schemes. For example, U+00A2, CENT |
---|
424 | SIGN, is not allowed as a delimiter in IRIs, because it is in the |
---|
425 | 'iunreserved' category. This is similar to the fact that it is not |
---|
426 | possible to use '-' as a delimiter in URIs, because it is in the |
---|
427 | 'unreserved' category.</t> |
---|
428 | |
---|
429 | </section> <!-- summary --> |
---|
430 | <section title="ABNF for IRI References and IRIs" anchor="abnf"> |
---|
431 | |
---|
432 | <t>An ABNF definition for IRI references (which are the most general |
---|
433 | concept and the start of the grammar) and IRIs is given here. The |
---|
434 | syntax of this ABNF is described in <xref target="STD68"/>. Character |
---|
435 | numbers are taken from the UCS, without implying any actual binary |
---|
436 | encoding. Terminals in the ABNF are characters, not octets.</t> |
---|
437 | |
---|
438 | <t>The following grammar closely follows the URI grammar in <xref |
---|
439 | target="RFC3986"/>, except that the range of unreserved characters is |
---|
440 | expanded to include UCS characters, with the restriction that private |
---|
441 | UCS characters can occur only in query parts. The grammar is split |
---|
442 | into two parts: Rules that differ from <xref target="RFC3986"/> |
---|
443 | because of the above-mentioned expansion, and rules that are the same |
---|
444 | as those in <xref target="RFC3986"/>. For rules that are different |
---|
445 | than those in <xref target="RFC3986"/>, the names of the non-terminals |
---|
446 | have been changed as follows. If the non-terminal contains 'URI', this |
---|
447 | has been changed to 'IRI'. Otherwise, an 'i' has been prefixed.</t> |
---|
448 | |
---|
449 | <!-- |
---|
450 | for line length measuring in artwork (max 72 chars, three chars at start): |
---|
451 | 1 2 3 4 5 6 7 |
---|
452 | 456789012345678901234567890123456789012345678901234567890123456789012 |
---|
453 | --> |
---|
454 | <figure> |
---|
455 | <preamble>The following rules are different from those in <xref target="RFC3986"/>:</preamble> |
---|
456 | <artwork> |
---|
457 | IRI = scheme ":" ihier-part [ "?" iquery ] |
---|
458 | [ "#" ifragment ] |
---|
459 | |
---|
460 | ihier-part = "//" iauthority ipath-abempty |
---|
461 | / ipath-absolute |
---|
462 | / ipath-rootless |
---|
463 | / ipath-empty |
---|
464 | |
---|
465 | IRI-reference = IRI / irelative-ref |
---|
466 | |
---|
467 | absolute-IRI = scheme ":" ihier-part [ "?" iquery ] |
---|
468 | |
---|
469 | irelative-ref = irelative-part [ "?" iquery ] [ "#" ifragment ] |
---|
470 | |
---|
471 | irelative-part = "//" iauthority ipath-abempty |
---|
472 | / ipath-absolute |
---|
473 | / ipath-noscheme |
---|
474 | / ipath-empty |
---|
475 | |
---|
476 | iauthority = [ iuserinfo "@" ] ihost [ ":" port ] |
---|
477 | iuserinfo = *( iunreserved / pct-form / sub-delims / ":" ) |
---|
478 | ihost = IP-literal / IPv4address / ireg-name |
---|
479 | |
---|
480 | pct-form = pct-encoded |
---|
481 | |
---|
482 | ireg-name = *( iunreserved / sub-delims ) |
---|
483 | |
---|
484 | ipath = ipath-abempty ; begins with "/" or is empty |
---|
485 | / ipath-absolute ; begins with "/" but not "//" |
---|
486 | / ipath-noscheme ; begins with a non-colon segment |
---|
487 | / ipath-rootless ; begins with a segment |
---|
488 | / ipath-empty ; zero characters |
---|
489 | |
---|
490 | ipath-abempty = *( path-sep isegment ) |
---|
491 | ipath-absolute = path-sep [ isegment-nz *( path-sep isegment ) ] |
---|
492 | ipath-noscheme = isegment-nz-nc *( path-sep isegment ) |
---|
493 | ipath-rootless = isegment-nz *( path-sep isegment ) |
---|
494 | ipath-empty = 0<ipchar> |
---|
495 | path-sep = "/" |
---|
496 | |
---|
497 | isegment = *ipchar |
---|
498 | isegment-nz = 1*ipchar |
---|
499 | isegment-nz-nc = 1*( iunreserved / pct-form / sub-delims |
---|
500 | / "@" ) |
---|
501 | ; non-zero-length segment without any colon ":" |
---|
502 | |
---|
503 | ipchar = iunreserved / pct-form / sub-delims / ":" |
---|
504 | / "@" |
---|
505 | |
---|
506 | iquery = *( ipchar / iprivate / "/" / "?" ) |
---|
507 | |
---|
508 | ifragment = *( ipchar / "/" / "?" / "#" ) |
---|
509 | |
---|
510 | iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar |
---|
511 | |
---|
512 | ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF |
---|
513 | / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD |
---|
514 | / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD |
---|
515 | / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD |
---|
516 | / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD |
---|
517 | / %xD0000-DFFFD / %xE1000-EFFFD |
---|
518 | |
---|
519 | iprivate = %xE000-F8FF / %xE0000-E0FFF / %xF0000-FFFFD |
---|
520 | / %x100000-10FFFD |
---|
521 | </artwork> |
---|
522 | </figure> |
---|
523 | |
---|
524 | <t>Some productions are ambiguous. The "first-match-wins" (a.k.a. "greedy") |
---|
525 | algorithm applies. For details, see <xref target="RFC3986"/>.</t> |
---|
526 | |
---|
527 | <figure> |
---|
528 | <preamble>The following rules are the same as those in <xref target="RFC3986"/>:</preamble> |
---|
529 | <artwork> |
---|
530 | scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) |
---|
531 | |
---|
532 | port = *DIGIT |
---|
533 | |
---|
534 | IP-literal = "[" ( IPv6address / IPvFuture ) "]" |
---|
535 | |
---|
536 | IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) |
---|
537 | |
---|
538 | IPv6address = 6( h16 ":" ) ls32 |
---|
539 | / "::" 5( h16 ":" ) ls32 |
---|
540 | / [ h16 ] "::" 4( h16 ":" ) ls32 |
---|
541 | / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 |
---|
542 | / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 |
---|
543 | / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 |
---|
544 | / [ *4( h16 ":" ) h16 ] "::" ls32 |
---|
545 | / [ *5( h16 ":" ) h16 ] "::" h16 |
---|
546 | / [ *6( h16 ":" ) h16 ] "::" |
---|
547 | |
---|
548 | h16 = 1*4HEXDIG |
---|
549 | ls32 = ( h16 ":" h16 ) / IPv4address |
---|
550 | |
---|
551 | IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet |
---|
552 | |
---|
553 | dec-octet = DIGIT ; 0-9 |
---|
554 | / %x31-39 DIGIT ; 10-99 |
---|
555 | / "1" 2DIGIT ; 100-199 |
---|
556 | / "2" %x30-34 DIGIT ; 200-249 |
---|
557 | / "25" %x30-35 ; 250-255 |
---|
558 | |
---|
559 | pct-encoded = "%" HEXDIG HEXDIG |
---|
560 | |
---|
561 | unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" |
---|
562 | reserved = gen-delims / sub-delims |
---|
563 | gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" |
---|
564 | sub-delims = "!" / "$" / "&" / "'" / "(" / ")" |
---|
565 | / "*" / "+" / "," / ";" / "=" |
---|
566 | </artwork></figure> |
---|
567 | |
---|
568 | <t>This syntax does not support IPv6 scoped addressing zone identifiers.</t> |
---|
569 | |
---|
570 | </section> <!-- abnf --> |
---|
571 | |
---|
572 | </section> <!-- syntax --> |
---|
573 | |
---|
574 | <section title="Processing IRIs and related protocol elements" anchor="processing"> |
---|
575 | |
---|
576 | <t>IRIs are meant to replace URIs in identifying resources within new |
---|
577 | versions of protocols, formats, and software components that use a |
---|
578 | UCS-based character repertoire. Protocols and components may use and |
---|
579 | process IRIs directly. However, there are still numerous systems and |
---|
580 | protocols which only accept URIs or components of parsed URIs; that is, |
---|
581 | they only accept sequences of characters within the subset of US-ASCII |
---|
582 | characters allowed in URIs. </t> |
---|
583 | |
---|
584 | <t>This section defines specific processing steps for IRI consumers |
---|
585 | which establish the relationship between the string given and the |
---|
586 | interpreted derivatives. These |
---|
587 | processing steps apply to both IRIs and IRI references (i.e., absolute |
---|
588 | or relative forms); for IRIs, some steps are scheme specific. </t> |
---|
589 | |
---|
590 | <section title="Converting to UCS" anchor="ucsconv"> |
---|
591 | |
---|
592 | <t>Input that is already in a Unicode form (i.e., a sequence of Unicode |
---|
593 | characters or an octet-stream representing a Unicode-based character |
---|
594 | encoding such as UTF-8 or UTF-16) should be left as is and not |
---|
595 | normalized (see <xref target="normalization"/>).</t> |
---|
596 | |
---|
597 | <t>An IRI or IRI reference is a sequence of characters from the UCS. |
---|
598 | For IRIs that are not already in a Unicode form |
---|
599 | (as when written on paper, read aloud, or represented in a text stream |
---|
600 | using a legacy character encoding), convert the IRI to Unicode. |
---|
601 | Note that some character encodings or transcriptions can be converted |
---|
602 | to or represented by more than one sequence of Unicode characters. |
---|
603 | Ideally the resulting IRI would use a normalized form, |
---|
604 | such as Unicode Normalization Form C <xref target="UTR15"/> |
---|
605 | (see <xref target='ladder'/> Normalization and Comparison), |
---|
606 | since that ensures a stable, consistent representation |
---|
607 | that is most likely to produce the intended results. |
---|
608 | Implementers and users are cautioned that, while denormalized character sequences are valid, |
---|
609 | they might be difficult for other users or processes to reproduce |
---|
610 | and might lead to unexpected results. |
---|
611 | </t> |
---|
612 | |
---|
613 | <t> In other cases (written on paper, read aloud, or otherwise |
---|
614 | represented independent of any character encoding) represent the IRI |
---|
615 | as a sequence of characters from the UCS normalized according to |
---|
616 | Unicode Normalization Form C (NFC, <xref target="UTR15"/>).</t> |
---|
617 | </section> <!-- ucsconv --> |
---|
618 | |
---|
619 | <section title="Parse the IRI into IRI components"> |
---|
620 | |
---|
621 | <t>Parse the IRI, either as a relative reference (no scheme) |
---|
622 | or using scheme specific processing (according to the scheme |
---|
623 | given); the result is a set of parsed IRI components. |
---|
624 | (NOTE: FIX BEFORE RELEASE: INTENT IS THAT ALL IRI SCHEMES |
---|
625 | THAT USE GENERIC SYNTAX AND ALLOW NON-ASCII AUTHORITY CAN |
---|
626 | ONLY USE AUTHORITY FOR NAMES THAT FOLLOW PUNYCODE.) |
---|
627 | </t> |
---|
628 | |
---|
629 | <t>NOTE: The result of parsing into components is |
---|
630 | a correspondence of substrings of the IRI according to the part |
---|
631 | matched. For example, in <xref target="HTML5"/>, the protocol |
---|
632 | components of interest are SCHEME (scheme), HOST (ireg-name), PORT |
---|
633 | (port), the PATH (ipath after the initial "/"), QUERY (iquery), |
---|
634 | FRAGMENT (ifragment), and AUTHORITY (iauthority). |
---|
635 | </t> |
---|
636 | |
---|
637 | <t>Subsequent processing rules are sometimes used to define other |
---|
638 | syntactic components. For example, <xref target="HTML5"/> defines APIs |
---|
639 | for IRI processing; in these APIs: |
---|
640 | |
---|
641 | <list style="hanging"> |
---|
642 | <t hangText="HOSTSPECIFIC"> the substring that follows |
---|
643 | the substring matched by the iauthority production, or the whole |
---|
644 | string if the iauthority production wasn't matched.</t> |
---|
645 | <t hangText="HOSTPORT"> if there is a scheme component and a port |
---|
646 | component and the port given by the port component is different than |
---|
647 | the default port defined for the protocol given by the scheme |
---|
648 | component, then HOSTPORT is the substring that starts with the |
---|
649 | substring matched by the host production and ends with the substring |
---|
650 | matched by the port production, and includes the colon in between the |
---|
651 | two. Otherwise, it is the same as the host component. |
---|
652 | </t> |
---|
653 | </list> |
---|
654 | </t> |
---|
655 | </section> <!-- parse --> |
---|
656 | |
---|
657 | <section title="General percent-encoding of IRI components" anchor="compmapping"> |
---|
658 | |
---|
659 | <t>For most IRI components, it is possible to map the IRI component |
---|
660 | to an equivalent URI component by percent-encoding those characters |
---|
661 | not allowed in URIs. Previous processing steps will have removed |
---|
662 | some characters, and the interpretation of reserved characters will |
---|
663 | have already been done (with the syntactic reserved characters outside |
---|
664 | of the IRI component). This mapping is defined for all sequences |
---|
665 | of Unicode characters, whether or not they are valid for the component |
---|
666 | in question. </t> |
---|
667 | |
---|
668 | <t>For each character which is not allowed in a valid URI (NOTE: WHAT |
---|
669 | IS THE RIGHT REFERENCE HERE), apply the following steps. </t> |
---|
670 | |
---|
671 | <t><list style="hanging"> |
---|
672 | |
---|
673 | <t hangText="Convert to UTF-8">Convert the character to a sequence of |
---|
674 | one or more octets using UTF-8 <xref target="RFC3629"/>.</t> |
---|
675 | |
---|
676 | <t hangText="Percent encode">Convert each octet of this sequence to %HH, |
---|
677 | where HH is the hexadecimal notation of the octet value. The |
---|
678 | hexadecimal notation SHOULD use uppercase letters. (This is the |
---|
679 | general URI percent-encoding mechanism in Section 2.1 of <xref |
---|
680 | target="RFC3986"/>.)</t> |
---|
681 | |
---|
682 | </list></t> |
---|
683 | |
---|
684 | <t>Note that the mapping is an identity transformation for parsed URI |
---|
685 | components of valid URIs, and is idempotent: applying the mapping a |
---|
686 | second time will not change anything.</t> |
---|
687 | </section> <!-- general conversion --> |
---|
688 | |
---|
689 | <section title="Mapping ireg-name" anchor="dnsmapping"> |
---|
690 | |
---|
691 | <t>Schemes that allow non-ASCII based characters |
---|
692 | in the reg-name (ireg-name) position MUST convert the ireg-name |
---|
693 | component of an IRI as follows:</t> |
---|
694 | |
---|
695 | <t>Replace the ireg-name part of the IRI by the part converted using |
---|
696 | the ToASCII operation specified in Section 4.1 of <xref |
---|
697 | target="RFC3490"/> on each dot-separated label, and by using U+002E |
---|
698 | (FULL STOP) as a label separator, with the flag UseSTD3ASCIIRules set |
---|
699 | to FALSE, and with the flag AllowUnassigned set to FALSE. |
---|
700 | The ToASCII operation may |
---|
701 | fail, but this would mean that the IRI cannot be resolved. |
---|
702 | In such cases, if the domain name conversion fails, then the |
---|
703 | entire IRI conversion fails. Processors that have no mechanism for |
---|
704 | signalling a failure MAY instead substitute an otherwise |
---|
705 | invalid host name, although such processing SHOULD be avoided. |
---|
706 | </t> |
---|
707 | |
---|
708 | <t>For example, the IRI |
---|
709 | <vspace/>"http://r&#xE9;sum&#xE9;.example.org"<vspace/> MAY be |
---|
710 | converted to <vspace/>"http://xn--rsum-bad.example.org"<vspace/>; |
---|
711 | conversion to percent-encoded form, e.g., |
---|
712 | <vspace/>"http://r%C3%A9sum%C3%A9.example.org", MUST NOT be performed. </t> |
---|
713 | |
---|
714 | <t><list style="hanging"> |
---|
715 | |
---|
716 | <t hangText="Note:">Domain Names may appear in parts of an IRI other |
---|
717 | than the ireg-name part. It is the responsibility of scheme-specific |
---|
718 | implementations (if the Internationalized Domain Name is part of the |
---|
719 | scheme syntax) or of server-side implementations (if the |
---|
720 | Internationalized Domain Name is part of 'iquery') to apply the |
---|
721 | necessary conversions at the appropriate point. Example: Trying to |
---|
722 | validate the Web page at<vspace/> |
---|
723 | http://r&#xE9;sum&#xE9;.example.org would lead to an IRI of |
---|
724 | <vspace/>http://validator.w3.org/check?uri=http%3A%2F%2Fr&#xE9;sum&#xE9;.<vspace/>example.org, |
---|
725 | which would convert to a URI |
---|
726 | of<vspace/>http://validator.w3.org/check?uri=http%3A%2F%2Fr%C3%A9sum%C3%A9.<vspace/>example.org. |
---|
727 | The server-side implementation is responsible for making the |
---|
728 | necessary conversions to be able to retrieve the Web page.</t> |
---|
729 | |
---|
730 | <t hangText="Note:">In this process, characters allowed in URI |
---|
731 | references and existing percent-encoded sequences are not encoded further. |
---|
732 | (This mapping is similar to, but different from, the encoding applied |
---|
733 | when arbitrary content is included in some part of a URI.) |
---|
734 | |
---|
735 | For example, an IRI of |
---|
736 | <vspace/>"http://www.example.org/red%09ros&#xE9;#red" |
---|
737 | (in XML notation) is converted to |
---|
738 | <vspace/>"http://www.example.org/red%09ros%C3%A9#red", not to |
---|
739 | something like |
---|
740 | <vspace/>"http%3A%2F%2Fwww.example.org%2Fred%2509ros%C3%A9%23red". |
---|
741 | ((DESIGN QUESTION: What about e.g. http://r%C3%A9sum%C3%A9.example.org in an IRI? Will that get converted to punycode, or not?)) |
---|
742 | |
---|
743 | </t> |
---|
744 | |
---|
745 | </list></t> |
---|
746 | </section> <!-- dnsmapping --> |
---|
747 | |
---|
748 | <section title="Mapping query components" anchor="querymapping"> |
---|
749 | |
---|
750 | <t>((NOTE: SEE ISSUES LIST)) |
---|
751 | |
---|
752 | For compatibility with existing deployed HTTP infrastructure, |
---|
753 | the following special case applies for schemes "http" and "https" |
---|
754 | and IRIs whose origin has a document charset other than one which |
---|
755 | is UCS-based (e.g., UTF-8 or UTF-16). In such a case, the "query" |
---|
756 | component of an IRI is mapped into a URI by using the document |
---|
757 | charset rather than UTF-8 as the binary representation before |
---|
758 | pct-encoding. This mapping is not applied for any other scheme |
---|
759 | or component.</t> |
---|
760 | |
---|
761 | </section> <!-- querymapping --> |
---|
762 | |
---|
763 | <section title="Mapping IRIs to URIs" anchor="mapping"> |
---|
764 | |
---|
765 | <t>The canonical mapping from an IRI to a URI is defined by applying the |
---|
766 | mapping above (from IRI to URI components) and then reassembling a URI |
---|
767 | from the parsed URI components using the original punctuation that |
---|
768 | delimited the IRI components. </t> |
---|
769 | |
---|
770 | </section> <!-- mapping --> |
---|
771 | |
---|
772 | <section title="Converting URIs to IRIs" anchor="URItoIRI"> |
---|
773 | |
---|
774 | <t>In some situations, for presentation and further processing, |
---|
775 | it is desirable to convert a URI into an equivalent IRI in which |
---|
776 | natural characters are represented directly rather than |
---|
777 | percent encoded. Of course, every URI is already an IRI in |
---|
778 | its own right without any conversion, and in general there |
---|
779 | are several possible conversion procedures. This section gives one such procedure for this conversion. |
---|
780 | </t> |
---|
781 | |
---|
782 | <t> |
---|
783 | The conversion described in this section, if given a valid URI, will |
---|
784 | result in an IRI that maps back to the URI used as an input for the |
---|
785 | conversion (except for potential case differences in percent-encoding |
---|
786 | and for potential percent-encoded unreserved characters). |
---|
787 | |
---|
788 | However, the IRI resulting from this conversion may differ |
---|
789 | from the original IRI (if there ever was one).</t> |
---|
790 | |
---|
791 | <t>URI-to-IRI conversion removes percent-encodings, but not all |
---|
792 | percent-encodings can be eliminated. There are several reasons for |
---|
793 | this:</t> |
---|
794 | |
---|
795 | <t><list style="hanging"> |
---|
796 | |
---|
797 | <t hangText="1.">Some percent-encodings are necessary to distinguish |
---|
798 | percent-encoded and unencoded uses of reserved characters.</t> |
---|
799 | |
---|
800 | <t hangText="2.">Some percent-encodings cannot be interpreted as sequences |
---|
801 | of UTF-8 octets.<vspace blankLines="1"/> |
---|
802 | (Note: The octet patterns of UTF-8 are highly regular. |
---|
803 | Therefore, there is a very high probability, but no guarantee, |
---|
804 | that percent-encodings that can be interpreted as sequences of UTF-8 |
---|
805 | octets actually originated from UTF-8. For a detailed discussion, |
---|
806 | see <xref target="Duerst97"/>.)</t> |
---|
807 | |
---|
808 | <t hangText="3.">The conversion may result in a character that is not |
---|
809 | appropriate in an IRI. See <xref target="abnf"/>, <xref target="visual"/>, |
---|
810 | and <xref target="limitations"/> for further details.</t> |
---|
811 | |
---|
812 | <t hangText="4.">IRI to URI conversion has different rules for |
---|
813 | dealing with domain names and query parameters.</t> |
---|
814 | |
---|
815 | </list></t> |
---|
816 | |
---|
817 | <t>Conversion from a URI to an IRI MAY be done by using the following |
---|
818 | steps: |
---|
819 | |
---|
820 | <list style="hanging"> |
---|
821 | <t hangText="1.">Represent the URI as a sequence of octets in |
---|
822 | US-ASCII.</t> |
---|
823 | |
---|
824 | <t hangText="2.">Convert all percent-encodings ("%" followed by two |
---|
825 | hexadecimal digits) to the corresponding octets, except those |
---|
826 | corresponding to "%", characters in "reserved", and characters |
---|
827 | in US-ASCII not allowed in URIs.</t> |
---|
828 | |
---|
829 | <t hangText="3.">Re-percent-encode any octet produced in step 2 that |
---|
830 | is not part of a strictly legal UTF-8 octet sequence.</t> |
---|
831 | |
---|
832 | |
---|
833 | <t hangText="4.">Re-percent-encode all octets produced in step 3 that |
---|
834 | in UTF-8 represent characters that are not appropriate according |
---|
835 | to <xref target="abnf"/>, <xref target="visual"/>, and <xref |
---|
836 | target="limitations"/>.</t> |
---|
837 | |
---|
838 | <t hangText="5.">Interpret the resulting octet sequence as a sequence |
---|
839 | of characters encoded in UTF-8.</t> |
---|
840 | |
---|
841 | <t hangText="6.">URIs known to contain domain names in the reg-name |
---|
842 | component SHOULD convert punycode-encoded domain name labels to |
---|
843 | the corresponding characters using the ToUnicode procedure. </t> |
---|
844 | </list></t> |
---|
845 | |
---|
846 | <t>This procedure will convert as many percent-encoded characters as |
---|
847 | possible to characters in an IRI. Because there are some choices when |
---|
848 | step 4 is applied (see <xref target="limitations"/>), results may |
---|
849 | vary.</t> |
---|
850 | |
---|
851 | <t>Conversions from URIs to IRIs MUST NOT use any character |
---|
852 | encoding other than UTF-8 in steps 3 and 4, even if it might be |
---|
853 | possible to guess from the context that another character encoding |
---|
854 | than UTF-8 was used in the URI. For example, the URI |
---|
855 | "http://www.example.org/r%E9sum%E9.html" might with some guessing be |
---|
856 | interpreted to contain two e-acute characters encoded as |
---|
857 | iso-8859-1. It must not be converted to an IRI containing these |
---|
858 | e-acute characters. Otherwise, in the future the IRI will be mapped to |
---|
859 | "http://www.example.org/r%C3%A9sum%C3%A9.html", which is a different |
---|
860 | URI from "http://www.example.org/r%E9sum%E9.html".</t> |
---|
861 | |
---|
862 | <section title="Examples"> |
---|
863 | |
---|
864 | <t>This section shows various examples of converting URIs to IRIs. |
---|
865 | Each example shows the result after each of the steps 1 through 6 is |
---|
866 | applied. XML Notation is used for the final result. Octets are |
---|
867 | denoted by "&lt;" followed by two hexadecimal digits followed by |
---|
868 | "&gt;".</t> |
---|
869 | |
---|
870 | <t>The following example contains the sequence "%C3%BC", which is a |
---|
871 | strictly legal UTF-8 sequence, and which is converted into the actual |
---|
872 | character U+00FC, LATIN SMALL LETTER U WITH DIAERESIS (also known as |
---|
873 | u-umlaut). |
---|
874 | |
---|
875 | <list style="hanging"> |
---|
876 | <t hangText="1.">http://www.example.org/D%C3%BCrst</t> |
---|
877 | <t hangText="2.">http://www.example.org/D&lt;c3&gt;&lt;bc&gt;rst</t> |
---|
878 | <t hangText="3.">http://www.example.org/D&lt;c3&gt;&lt;bc&gt;rst</t> |
---|
879 | <t hangText="4.">http://www.example.org/D&lt;c3&gt;&lt;bc&gt;rst</t> |
---|
880 | <t hangText="5.">http://www.example.org/D&#xFC;rst</t> |
---|
881 | <t hangText="6.">http://www.example.org/D&#xFC;rst</t> |
---|
882 | </list> |
---|
883 | </t> |
---|
884 | |
---|
885 | <t>The following example contains the sequence "%FC", which might |
---|
886 | represent U+00FC, LATIN SMALL LETTER U WITH DIAERESIS, in |
---|
887 | the<vspace/>iso-8859-1 character encoding. (It might represent other |
---|
888 | characters in other character encodings. For example, the octet |
---|
889 | &lt;fc&gt; in iso-8859-5 represents U+045C, CYRILLIC SMALL LETTER |
---|
890 | KJE.) Because &lt;fc&gt; is not part of a strictly legal UTF-8 |
---|
891 | sequence, it is re-percent-encoded in step 3. |
---|
892 | |
---|
893 | |
---|
894 | <list style="hanging"> |
---|
895 | <t hangText="1.">http://www.example.org/D%FCrst</t> |
---|
896 | <t hangText="2.">http://www.example.org/D&lt;fc&gt;rst</t> |
---|
897 | <t hangText="3.">http://www.example.org/D%FCrst</t> |
---|
898 | <t hangText="4.">http://www.example.org/D%FCrst</t> |
---|
899 | <t hangText="5.">http://www.example.org/D%FCrst</t> |
---|
900 | <t hangText="6.">http://www.example.org/D%FCrst</t> |
---|
901 | </list> |
---|
902 | </t> |
---|
903 | |
---|
904 | <t>The following example contains "%e2%80%ae", which is the percent-encoded<vspace/>UTF-8 |
---|
905 | character encoding of U+202E, RIGHT-TO-LEFT OVERRIDE. <xref target="visual"/> |
---|
906 | forbids the direct use of this character in an IRI. Therefore, the |
---|
907 | corresponding octets are re-percent-encoded in step 4. This example shows |
---|
908 | that the case (upper- or lowercase) of letters used in percent-encodings may not be preserved. |
---|
909 | The example also contains a punycode-encoded domain name label (xn--99zt52a), |
---|
910 | which is converted to the corresponding characters in step 6. |
---|
911 | |
---|
912 | <list style="hanging"> |
---|
913 | <t hangText="1.">http://xn--99zt52a.example.org/%e2%80%ae</t> |
---|
914 | <t hangText="2.">http://xn--99zt52a.example.org/&lt;e2&gt;&lt;80&gt;&lt;ae&gt;</t> |
---|
915 | <t hangText="3.">http://xn--99zt52a.example.org/&lt;e2&gt;&lt;80&gt;&lt;ae&gt;</t> |
---|
916 | <t hangText="4.">http://xn--99zt52a.example.org/%E2%80%AE</t> |
---|
917 | <t hangText="5.">http://xn--99zt52a.example.org/%E2%80%AE</t> |
---|
918 | <t hangText="6.">http://&#x7D0D;&#x8C46;.example.org/%E2%80%AE</t> |
---|
919 | </list></t> |
---|
920 | |
---|
921 | <t>Note that the label "xn--99zt52a" is converted to U+7D0D U+8C46 |
---|
922 | (Japanese Natto). ((EDITOR NOTE: There is some inconsistency in this note.))</t> |
---|
923 | |
---|
924 | </section> <!-- examples --> |
---|
925 | </section> <!-- URItoIRI --> |
---|
926 | </section> <!-- processing --> |
---|
927 | <section title="Bidirectional IRIs for Right-to-Left Languages" anchor="Bidi"> |
---|
928 | |
---|
929 | <t>Some UCS characters, such as those used in the Arabic and Hebrew |
---|
930 | scripts, have an inherent right-to-left (rtl) writing direction. IRIs |
---|
931 | containing these characters (called bidirectional IRIs or Bidi IRIs) |
---|
932 | require additional attention because of the non-trivial relation |
---|
933 | between logical representation (used for digital representation and |
---|
934 | for reading/spelling) and visual representation (used for |
---|
935 | display/printing).</t> |
---|
936 | |
---|
937 | <t>Because of the complex interaction between the logical representation, |
---|
938 | the visual representation, and the syntax of a Bidi IRI, a balance is |
---|
939 | needed between various requirements. |
---|
940 | The main requirements are<list style="hanging"> |
---|
941 | <t hangText="1.">user-predictable conversion between visual and |
---|
942 | logical representation;</t> |
---|
943 | <t hangText="2.">the ability to include a wide range of characters |
---|
944 | in various parts of the IRI; and</t> |
---|
945 | <t hangText="3.">minor or no changes or restrictions for |
---|
946 | implementations.</t> |
---|
947 | </list></t> |
---|
948 | |
---|
949 | <section title="Logical Storage and Visual Presentation" anchor="visual"> |
---|
950 | |
---|
951 | <t>When stored or transmitted in digital representation, bidirectional |
---|
952 | IRIs MUST be in full logical order and MUST conform to the IRI syntax |
---|
953 | rules (which includes the rules relevant to their scheme). This |
---|
954 | ensures that bidirectional IRIs can be processed in the same way as |
---|
955 | other IRIs.</t> <t>Bidirectional IRIs MUST be rendered by using the |
---|
956 | Unicode Bidirectional Algorithm <xref target="UNIV6"/>, <xref |
---|
957 | target="UNI9"/>. Bidirectional IRIs MUST be rendered in the same way |
---|
958 | as they would be if they were in a left-to-right embedding; i.e., as |
---|
959 | if they were preceded by U+202A, LEFT-TO-RIGHT EMBEDDING (LRE), and |
---|
960 | followed by U+202C, POP DIRECTIONAL FORMATTING (PDF). Setting the |
---|
961 | embedding direction can also be done in a higher-level protocol (e.g., |
---|
962 | the dir='ltr' attribute in HTML).</t> |
---|
963 | |
---|
964 | <t>There is no requirement to use the above embedding if the display |
---|
965 | is still the same without the embedding. For example, a bidirectional |
---|
966 | IRI in a text with left-to-right base directionality (such as used for |
---|
967 | English or Cyrillic) that is preceded and followed by whitespace and |
---|
968 | strong left-to-right characters does not need an embedding. Also, a |
---|
969 | bidirectional relative IRI reference that only contains strong |
---|
970 | right-to-left characters and weak characters and that starts and ends |
---|
971 | with a strong right-to-left character and appears in a text with |
---|
972 | right-to-left base directionality (such as used for Arabic or Hebrew) |
---|
973 | and is preceded and followed by whitespace and strong characters does |
---|
974 | not need an embedding.</t> |
---|
975 | |
---|
976 | <t>In some other cases, using U+200E, LEFT-TO-RIGHT MARK (LRM), may be |
---|
977 | sufficient to force the correct display behavior. However, the |
---|
978 | details of the Unicode Bidirectional algorithm are not always easy to |
---|
979 | understand. Implementers are strongly advised to err on the side of |
---|
980 | caution and to use embedding in all cases where they are not |
---|
981 | completely sure that the display behavior is unaffected without the |
---|
982 | embedding.</t> |
---|
983 | |
---|
984 | <t>The Unicode Bidirectional Algorithm (<xref target="UNI9"/>, section |
---|
985 | 4.3) permits higher-level protocols to influence bidirectional |
---|
986 | rendering. Such changes by higher-level protocols MUST NOT be used if |
---|
987 | they change the rendering of IRIs.</t> |
---|
988 | |
---|
989 | <t>The bidirectional formatting characters that may be used before or |
---|
990 | after the IRI to ensure correct display are not themselves part of the |
---|
991 | IRI. IRIs MUST NOT contain bidirectional formatting characters (LRM, |
---|
992 | RLM, LRE, RLE, LRO, RLO, and PDF). They affect the visual rendering of |
---|
993 | the IRI but do not appear themselves. It would therefore not be |
---|
994 | possible to input an IRI with such characters correctly.</t> |
---|
995 | |
---|
996 | </section> <!-- visual --> |
---|
997 | <section title="Bidi IRI Structure" anchor="bidi-structure"> |
---|
998 | |
---|
999 | <t>The Unicode Bidirectional Algorithm is designed mainly for running |
---|
1000 | text. To make sure that it does not affect the rendering of |
---|
1001 | bidirectional IRIs too much, some restrictions on bidirectional IRIs |
---|
1002 | are necessary. These restrictions are given in terms of delimiters |
---|
1003 | (structural characters, mostly punctuation such as "@", ".", ":", |
---|
1004 | and<vspace/>"/") and components (usually consisting mostly of letters |
---|
1005 | and digits).</t> |
---|
1006 | |
---|
1007 | <t>The following syntax rules from <xref target="abnf"/> correspond to |
---|
1008 | components for the purpose of Bidi behavior: iuserinfo, ireg-name, |
---|
1009 | isegment, isegment-nz, isegment-nz-nc, iquery, and |
---|
1010 | ifragment.</t> |
---|
1011 | |
---|
1012 | <t>Specifications that define the syntax of any of the above |
---|
1013 | components MAY divide them further and define smaller parts to be |
---|
1014 | components according to this document. As an example, the restrictions |
---|
1015 | of <xref target="RFC3490"/> on bidirectional domain names correspond |
---|
1016 | to treating each label of a domain name as a component for schemes |
---|
1017 | with ireg-name as a domain name. Even where the components are not |
---|
1018 | defined formally, it may be helpful to think about some syntax in |
---|
1019 | terms of components and to apply the relevant restrictions. For |
---|
1020 | example, for the usual name/value syntax in query parts, it is |
---|
1021 | convenient to treat each name and each value as a component. As |
---|
1022 | another example, the extensions in a resource name can be treated as |
---|
1023 | separate components.</t> |
---|
1024 | |
---|
1025 | <t>For each component, the following restrictions apply:</t> |
---|
1026 | <t> |
---|
1027 | <list style="hanging"> |
---|
1028 | |
---|
1029 | <t hangText="1.">A component SHOULD NOT use both right-to-left and |
---|
1030 | left-to-right characters.</t> |
---|
1031 | |
---|
1032 | <t hangText="2.">A component using right-to-left characters SHOULD |
---|
1033 | start and end with right-to-left characters.</t> |
---|
1034 | |
---|
1035 | </list></t> |
---|
1036 | |
---|
1037 | <t>The above restrictions are given as "SHOULD"s, rather than as |
---|
1038 | "MUST"s. For IRIs that are never presented visually, they are not |
---|
1039 | relevant. However, for IRIs in general, they are very important to |
---|
1040 | ensure consistent conversion between visual presentation and logical |
---|
1041 | representation, in both directions.</t> |
---|
1042 | |
---|
1043 | <t><list style="hanging"> |
---|
1044 | |
---|
1045 | <t hangText="Note:">In some components, the above restrictions may |
---|
1046 | actually be strictly enforced. For example, <xref |
---|
1047 | target="RFC3490"></xref> requires that these restrictions apply to |
---|
1048 | the labels of a host name for those schemes where ireg-name is a |
---|
1049 | host name. In some other components (for example, path components) |
---|
1050 | following these restrictions may not be too difficult. For other |
---|
1051 | components, such as parts of the query part, it may be very |
---|
1052 | difficult to enforce the restrictions because the values of query |
---|
1053 | parameters may be arbitrary character sequences.</t> |
---|
1054 | |
---|
1055 | </list></t> |
---|
1056 | |
---|
1057 | <t>If the above restrictions cannot be satisfied otherwise, the |
---|
1058 | affected component can always be mapped to URI notation as described |
---|
1059 | in <xref target="compmapping"/>. Please note that the whole component |
---|
1060 | has to be mapped (see also Example 9 below).</t> |
---|
1061 | |
---|
1062 | </section> <!-- bidi-structure --> |
---|
1063 | |
---|
1064 | <section title="Input of Bidi IRIs" anchor="bidiInput"> |
---|
1065 | |
---|
1066 | <t>Bidi input methods MUST generate Bidi IRIs in logical order while |
---|
1067 | rendering them according to <xref target="visual"/>. During input, |
---|
1068 | rendering SHOULD be updated after every new character is input to |
---|
1069 | avoid end-user confusion.</t> |
---|
1070 | |
---|
1071 | </section> <!-- bidiInput --> |
---|
1072 | |
---|
1073 | <section title="Examples"> |
---|
1074 | |
---|
1075 | <t>This section gives examples of bidirectional IRIs, in Bidi |
---|
1076 | Notation. It shows legal IRIs with the relationship between logical |
---|
1077 | and visual representation and explains how certain phenomena in this |
---|
1078 | relationship may look strange to somebody not familiar with |
---|
1079 | bidirectional behavior, but familiar to users of Arabic and Hebrew. It |
---|
1080 | also shows what happens if the restrictions given in <xref |
---|
1081 | target="bidi-structure"/> are not followed. The examples below can be |
---|
1082 | seen at <xref target="BidiEx"/>, in Arabic, Hebrew, and Bidi Notation |
---|
1083 | variants.</t> |
---|
1084 | |
---|
1085 | <t>To read the bidi text in the examples, read the visual |
---|
1086 | representation from left to right until you encounter a block of rtl |
---|
1087 | text. Read the rtl block (including slashes and other special |
---|
1088 | characters) from right to left, then continue at the next unread ltr |
---|
1089 | character.</t> |
---|
1090 | |
---|
1091 | <t>Example 1: A single component with rtl characters is inverted: |
---|
1092 | <vspace/>Logical representation: |
---|
1093 | "http://ab.CDEFGH.ij/kl/mn/op.html"<vspace/>Visual representation: |
---|
1094 | "http://ab.HGFEDC.ij/kl/mn/op.html"<vspace/> Components can be read |
---|
1095 | one by one, and each component can be read in its natural |
---|
1096 | direction.</t> |
---|
1097 | |
---|
1098 | <t>Example 2: More than one consecutive component with rtl characters |
---|
1099 | is inverted as a whole: <vspace/>Logical representation: |
---|
1100 | "http://ab.CDE.FGH/ij/kl/mn/op.html"<vspace/>Visual representation: |
---|
1101 | "http://ab.HGF.EDC/ij/kl/mn/op.html"<vspace/> A sequence of rtl |
---|
1102 | components is read rtl, in the same way as a sequence of rtl words is |
---|
1103 | read rtl in a bidi text.</t> |
---|
1104 | |
---|
1105 | <t>Example 3: All components of an IRI (except for the scheme) are |
---|
1106 | rtl. All rtl components are inverted overall: <vspace/>Logical |
---|
1107 | representation: |
---|
1108 | "http://AB.CD.EF/GH/IJ/KL?MN=OP;QR=ST#UV"<vspace/>Visual |
---|
1109 | representation: "http://VU#TS=RQ;PO=NM?LK/JI/HG/FE.DC.BA"<vspace/> The |
---|
1110 | whole IRI (except the scheme) is read rtl. Delimiters between rtl |
---|
1111 | components stay between the respective components; delimiters between |
---|
1112 | ltr and rtl components don't move.</t> |
---|
1113 | |
---|
1114 | <t>Example 4: Each of several sequences of rtl components is inverted |
---|
1115 | on its own: <vspace/>Logical representation: |
---|
1116 | "http://AB.CD.ef/gh/IJ/KL.html"<vspace/>Visual representation: |
---|
1117 | "http://DC.BA.ef/gh/LK/JI.html"<vspace/> Each sequence of rtl |
---|
1118 | components is read rtl, in the same way as each sequence of rtl words |
---|
1119 | in an ltr text is read rtl.</t> |
---|
1120 | |
---|
1121 | <t>Example 5: Example 2, applied to components of different kinds: |
---|
1122 | <vspace/>Logical representation: "http://ab.cd.EF/GH/ij/kl.html" |
---|
1123 | <vspace/>Visual representation: |
---|
1124 | "http://ab.cd.HG/FE/ij/kl.html"<vspace/> The inversion of the domain |
---|
1125 | name label and the path component may be unexpected, but it is |
---|
1126 | consistent with other bidi behavior. For reassurance that the domain |
---|
1127 | component really is "ab.cd.EF", it may be helpful to read aloud the |
---|
1128 | visual representation following the bidi algorithm. After |
---|
1129 | "http://ab.cd." one reads the RTL block "E-F-slash-G-H", which |
---|
1130 | corresponds to the logical representation. |
---|
1131 | </t> |
---|
1132 | |
---|
1133 | <t>Example 6: Same as Example 5, with more rtl components: |
---|
1134 | <vspace/>Logical representation: |
---|
1135 | "http://ab.CD.EF/GH/IJ/kl.html"<vspace/>Visual representation: |
---|
1136 | "http://ab.JI/HG/FE.DC/kl.html"<vspace/> The inversion of the domain |
---|
1137 | name labels and the path components may be easier to identify because |
---|
1138 | the delimiters also move.</t> |
---|
1139 | |
---|
1140 | <t>Example 7: A single rtl component includes digits: <vspace/>Logical |
---|
1141 | representation: "http://ab.CDE123FGH.ij/kl/mn/op.html"<vspace/>Visual |
---|
1142 | representation: "http://ab.HGF123EDC.ij/kl/mn/op.html"<vspace/> |
---|
1143 | Numbers are written ltr in all cases but are treated as an additional |
---|
1144 | embedding inside a run of rtl characters. This is completely |
---|
1145 | consistent with usual bidirectional text.</t> |
---|
1146 | |
---|
1147 | <t>Example 8 (not allowed): Numbers are at the start or end of an rtl |
---|
1148 | component:<vspace/>Logical representation: |
---|
1149 | "http://ab.cd.ef/GH1/2IJ/KL.html"<vspace/>Visual representation: |
---|
1150 | "http://ab.cd.ef/LK/JI1/2HG.html"<vspace/> The sequence "1/2" is |
---|
1151 | interpreted by the bidi algorithm as a fraction, fragmenting the |
---|
1152 | components and leading to confusion. There are other characters that |
---|
1153 | are interpreted in a special way close to numbers; in particular, "+", |
---|
1154 | "-", "#", "$", "%", ",", ".", and ":".</t> |
---|
1155 | |
---|
1156 | <t>Example 9 (not allowed): The numbers in the previous example are |
---|
1157 | percent-encoded: <vspace/>Logical representation: |
---|
1158 | "http://ab.cd.ef/GH%31/%32IJ/KL.html"<vspace/>Visual representation: |
---|
1159 | "http://ab.cd.ef/LK/JI%32/%31HG.html"</t> |
---|
1160 | |
---|
1161 | <t>Example 10 (allowed but not recommended): <vspace/>Logical |
---|
1162 | representation: "http://ab.CDEFGH.123/kl/mn/op.html"<vspace/>Visual |
---|
1163 | representation: "http://ab.123.HGFEDC/kl/mn/op.html"<vspace/> |
---|
1164 | Components consisting of only numbers are allowed (it would be rather |
---|
1165 | difficult to prohibit them), but these may interact with adjacent RTL |
---|
1166 | components in ways that are not easy to predict.</t> |
---|
1167 | |
---|
1168 | <t>Example 11 (allowed but not recommended): <vspace/>Logical |
---|
1169 | representation: "http://ab.CDEFGH.123ij/kl/mn/op.html"<vspace/>Visual |
---|
1170 | representation: "http://ab.123.HGFEDCij/kl/mn/op.html"<vspace/> |
---|
1171 | Components consisting of numbers and left-to-right characters are |
---|
1172 | allowed, but these may interact with adjacent RTL components in ways |
---|
1173 | that are not easy to predict.</t> |
---|
1174 | </section><!-- examples --> |
---|
1175 | </section><!-- bidi --> |
---|
1176 | |
---|
1177 | <section title="Normalization and Comparison" anchor="equivalence"> |
---|
1178 | |
---|
1179 | <t><list style="hanging"><t hangText="Note:">The structure and much of |
---|
1180 | the material for this section is taken from Section 6 of <xref |
---|
1181 | target="RFC3986"></xref>; the differences are due to the specifics |
---|
1182 | of IRIs.</t></list></t> |
---|
1183 | |
---|
1184 | <t>One of the most common operations on IRIs is simple comparison: |
---|
1185 | Determining whether two IRIs are equivalent, without using the IRIs to |
---|
1186 | access their respective resource(s). A comparison is performed |
---|
1187 | whenever a response cache is accessed, a browser checks its history to |
---|
1188 | color a link, or an XML parser processes tags within a |
---|
1189 | namespace. Extensive normalization prior to comparison of IRIs may be |
---|
1190 | used by spiders and indexing engines to prune a search space or reduce |
---|
1191 | duplication of request actions and response storage.</t> |
---|
1192 | |
---|
1193 | <t>IRI comparison is performed for some particular purpose. Protocols |
---|
1194 | or implementations that compare IRIs for different purposes will often |
---|
1195 | be subject to differing design trade-offs with regard to how much |
---|
1196 | effort should be spent in reducing aliased identifiers. This section |
---|
1197 | describes various methods that may be used to compare IRIs, the |
---|
1198 | trade-offs between them, and the types of applications that might use |
---|
1199 | them.</t> |
---|
1200 | |
---|
1201 | <section title="Equivalence"> |
---|
1202 | |
---|
1203 | <t>Because IRIs exist to identify resources, presumably they should be |
---|
1204 | considered equivalent when they identify the same resource. However, |
---|
1205 | this definition of equivalence is not of much practical use, as there |
---|
1206 | is no way for an implementation to compare two resources to determine |
---|
1207 | if they are "the same" unless it has full knowledge or control of |
---|
1208 | them. For this reason, determination of equivalence or difference of |
---|
1209 | IRIs is based on string comparison, perhaps augmented by reference to |
---|
1210 | additional rules provided by URI scheme definitions. We use the terms |
---|
1211 | "different" and "equivalent" to describe the possible outcomes of such |
---|
1212 | comparisons, but there are many application-dependent versions of |
---|
1213 | equivalence.</t> |
---|
1214 | |
---|
1215 | <t>Even when it is possible to determine that two IRIs are equivalent, |
---|
1216 | IRI comparison is not sufficient to determine whether two IRIs |
---|
1217 | identify different resources. For example, an owner of two different |
---|
1218 | domain names could decide to serve the same resource from both, |
---|
1219 | resulting in two different IRIs. Therefore, comparison methods are |
---|
1220 | designed to minimize false negatives while strictly avoiding false |
---|
1221 | positives.</t> |
---|
1222 | |
---|
1223 | <t>In testing for equivalence, applications should not directly |
---|
1224 | compare relative references; the references should be converted to |
---|
1225 | their respective target IRIs before comparison. When IRIs are compared |
---|
1226 | to select (or avoid) a network action, such as retrieval of a |
---|
1227 | representation, fragment components (if any) should be excluded from |
---|
1228 | the comparison.</t> |
---|
1229 | |
---|
1230 | <t>Applications using IRIs as identity tokens with no relationship to |
---|
1231 | a protocol MUST use the Simple String Comparison (see <xref |
---|
1232 | target="stringcomp"></xref>). All other applications MUST select one |
---|
1233 | of the comparison practices from the Comparison Ladder (see <xref |
---|
1234 | target="ladder"></xref>).</t> |
---|
1235 | </section> <!-- equivalence --> |
---|
1236 | |
---|
1237 | |
---|
1238 | <section title="Preparation for Comparison"> |
---|
1239 | <t>Any kind of IRI comparison REQUIRES that any additional contextual |
---|
1240 | processing is first performed, including undoing higher-level |
---|
1241 | escapings or encodings in the protocol or format that carries an |
---|
1242 | IRI. This preprocessing is usually done when the protocol or format is |
---|
1243 | parsed.</t> |
---|
1244 | |
---|
1245 | <t>Examples of contextual preprocessing steps are described in <xref |
---|
1246 | target="LEIRIHREF"/>. </t> |
---|
1247 | |
---|
1248 | <t>Examples of such escapings or encodings are entities and |
---|
1249 | numeric character references in <xref target="HTML4"></xref> and <xref |
---|
1250 | target="XML1"></xref>. As an example, |
---|
1251 | "http://example.org/ros&eacute;" (in HTML), |
---|
1252 | "http://example.org/ros&#233;" (in HTML or XML), and |
---|
1253 | <vspace/>"http://example.org/ros&#xE9;" (in HTML or XML) are all |
---|
1254 | resolved into what is denoted in this document (see <xref |
---|
1255 | target="sec-Notation"></xref>) as "http://example.org/ros&#xE9;" |
---|
1256 | (the "&#xE9;" here standing for the actual e-acute character, to |
---|
1257 | compensate for the fact that this document cannot contain non-ASCII |
---|
1258 | characters).</t> |
---|
1259 | |
---|
1260 | <t>Similar considerations apply to encodings such as Transfer Codings |
---|
1261 | in HTTP (see <xref target="RFC2616"></xref>) and Content Transfer |
---|
1262 | Encodings in MIME (<xref target="RFC2045"></xref>), although in these |
---|
1263 | cases, the encoding is based not on characters but on octets, and |
---|
1264 | additional care is required to make sure that characters, and not just |
---|
1265 | arbitrary octets, are compared (see <xref |
---|
1266 | target="stringcomp"></xref>).</t> |
---|
1267 | |
---|
1268 | </section> <!-- preparation --> |
---|
1269 | |
---|
1270 | <section title="Comparison Ladder" anchor="ladder"> |
---|
1271 | |
---|
1272 | <t>In practice, a variety of methods are used to test IRI |
---|
1273 | equivalence. These methods fall into a range distinguished by the |
---|
1274 | amount of processing required and the degree to which the probability |
---|
1275 | of false negatives is reduced. As noted above, false negatives cannot |
---|
1276 | be eliminated. In practice, their probability can be reduced, but this |
---|
1277 | reduction requires more processing and is not cost-effective for all |
---|
1278 | applications.</t> |
---|
1279 | |
---|
1280 | |
---|
1281 | <t>If this range of comparison practices is considered as a ladder, |
---|
1282 | the following discussion will climb the ladder, starting with |
---|
1283 | practices that are cheap but have a relatively higher chance of |
---|
1284 | producing false negatives, and proceeding to those that have higher |
---|
1285 | computational cost and lower risk of false negatives.</t> |
---|
1286 | |
---|
1287 | <section title="Simple String Comparison" anchor="stringcomp"> |
---|
1288 | |
---|
1289 | <t>If two IRIs, when considered as character strings, are identical, |
---|
1290 | then it is safe to conclude that they are equivalent. This type of |
---|
1291 | equivalence test has very low computational cost and is in wide use in |
---|
1292 | a variety of applications, particularly in the domain of parsing. It |
---|
1293 | is also used when a definitive answer to the question of IRI |
---|
1294 | equivalence is needed that is independent of the scheme used and that |
---|
1295 | can be calculated quickly and without accessing a network. An example |
---|
1296 | of such a case is XML Namespaces (<xref |
---|
1297 | target="XMLNamespace"></xref>).</t> |
---|
1298 | |
---|
1299 | |
---|
1300 | <t>Testing strings for equivalence requires some basic precautions. |
---|
1301 | This procedure is often referred to as "bit-for-bit" or |
---|
1302 | "byte-for-byte" comparison, which is potentially misleading. Testing |
---|
1303 | strings for equality is normally based on pair comparison of the |
---|
1304 | characters that make up the strings, starting from the first and |
---|
1305 | proceeding until both strings are exhausted and all characters are |
---|
1306 | found to be equal, until a pair of characters compares unequal, or |
---|
1307 | until one of the strings is exhausted before the other.</t> |
---|
1308 | |
---|
1309 | <t>This character comparison requires that each pair of characters be |
---|
1310 | put in comparable encoding form. For example, should one IRI be stored |
---|
1311 | in a byte array in UTF-8 encoding form and the second in a UTF-16 |
---|
1312 | encoding form, bit-for-bit comparisons applied naively will produce |
---|
1313 | errors. It is better to speak of equality on a character-for-character |
---|
1314 | rather than on a byte-for-byte or bit-for-bit basis. In practical |
---|
1315 | terms, character-by-character comparisons should be done codepoint by |
---|
1316 | codepoint after conversion to a common character encoding form. |
---|
1317 | |
---|
1318 | When comparing character by character, the comparison function MUST |
---|
1319 | NOT map IRIs to URIs, because such a mapping would create additional |
---|
1320 | spurious equivalences. It follows that an IRI SHOULD NOT be modified |
---|
1321 | when being transported if there is any chance that this IRI might be |
---|
1322 | used in a context that uses Simple String Comparison.</t> |
---|
1323 | |
---|
1324 | |
---|
1325 | <t>False negatives are caused by the production and use of IRI |
---|
1326 | aliases. Unnecessary aliases can be reduced, regardless of the |
---|
1327 | comparison method, by consistently providing IRI references in an |
---|
1328 | already normalized form (i.e., a form identical to what would be |
---|
1329 | produced after normalization is applied, as described below). |
---|
1330 | Protocols and data formats often limit some IRI comparisons to simple |
---|
1331 | string comparison, based on the theory that people and implementations |
---|
1332 | will, in their own best interest, be consistent in providing IRI |
---|
1333 | references, or at least be consistent enough to negate any efficiency |
---|
1334 | that might be obtained from further normalization.</t> |
---|
1335 | </section> <!-- stringcomp --> |
---|
1336 | |
---|
1337 | <section title="Syntax-Based Normalization"> |
---|
1338 | |
---|
1339 | <figure><preamble>Implementations may use logic based on the |
---|
1340 | definitions provided by this specification to reduce the probability |
---|
1341 | of false negatives. This processing is moderately higher in cost than |
---|
1342 | character-for-character string comparison. For example, an application |
---|
1343 | using this approach could reasonably consider the following two IRIs |
---|
1344 | equivalent:</preamble> |
---|
1345 | |
---|
1346 | <artwork> |
---|
1347 | example://a/b/c/%7Bfoo%7D/ros&#xE9; |
---|
1348 | eXAMPLE://a/./b/../b/%63/%7bfoo%7d/ros%C3%A9 |
---|
1349 | </artwork></figure> |
---|
1350 | |
---|
1351 | <t>Web user agents, such as browsers, typically apply this type of IRI |
---|
1352 | normalization when determining whether a cached response is |
---|
1353 | available. Syntax-based normalization includes such techniques as case |
---|
1354 | normalization, character normalization, percent-encoding |
---|
1355 | normalization, and removal of dot-segments.</t> |
---|
1356 | |
---|
1357 | <section title="Case Normalization"> |
---|
1358 | |
---|
1359 | <t>For all IRIs, the hexadecimal digits within a percent-encoding |
---|
1360 | triplet (e.g., "%3a" versus "%3A") are case-insensitive and therefore |
---|
1361 | should be normalized to use uppercase letters for the digits A-F.</t> |
---|
1362 | |
---|
1363 | <t>When an IRI uses components of the generic syntax, the component |
---|
1364 | syntax equivalence rules always apply; namely, that the scheme and |
---|
1365 | US-ASCII only host are case insensitive and therefore should be |
---|
1366 | normalized to lowercase. For example, the URI |
---|
1367 | "HTTP://www.EXAMPLE.com/" is equivalent to |
---|
1368 | "http://www.example.com/". Case equivalence for non-ASCII characters |
---|
1369 | in IRI components that are IDNs is discussed in <xref |
---|
1370 | target="schemecomp"></xref>. The other generic syntax components are |
---|
1371 | assumed to be case sensitive unless specifically defined otherwise by |
---|
1372 | the scheme.</t> |
---|
1373 | |
---|
1374 | <t>Creating schemes that allow case-insensitive syntax components |
---|
1375 | containing non-ASCII characters should be avoided. Case normalization |
---|
1376 | of non-ASCII characters can be culturally dependent and is always a |
---|
1377 | complex operation. The only exception concerns non-ASCII host names |
---|
1378 | for which the character normalization includes a mapping step derived |
---|
1379 | from case folding.</t> |
---|
1380 | |
---|
1381 | </section> <!-- casenorm --> |
---|
1382 | |
---|
1383 | <section title="Character Normalization" anchor="normalization"> |
---|
1384 | |
---|
1385 | <t>The Unicode Standard <xref target="UNIV6"></xref> defines various |
---|
1386 | equivalences between sequences of characters for various |
---|
1387 | purposes. Unicode Standard Annex #15 <xref target="UTR15"></xref> |
---|
1388 | defines various Normalization Forms for these equivalences, in |
---|
1389 | particular Normalization Form C (NFC, Canonical Decomposition, |
---|
1390 | followed by Canonical Composition) and Normalization Form KC (NFKC, |
---|
1391 | Compatibility Decomposition, followed by Canonical Composition).</t> |
---|
1392 | |
---|
1393 | <t> IRIs already in Unicode MUST NOT be normalized before parsing or |
---|
1394 | interpreting. In many non-Unicode character encodings, some text |
---|
1395 | cannot be represented directly. For example, the word "Vietnam" is |
---|
1396 | natively written "Vi&#x1EC7;t Nam" (containing a LATIN SMALL |
---|
1397 | LETTER E WITH CIRCUMFLEX AND DOT BELOW) in NFC, but a direct |
---|
1398 | transcoding from the windows-1258 character encoding leads to |
---|
1399 | "Vi&#xEA;&#x323;t Nam" (containing a LATIN SMALL LETTER E WITH |
---|
1400 | CIRCUMFLEX followed by a COMBINING DOT BELOW). Direct transcoding of |
---|
1401 | other 8-bit encodings of Vietnamese may lead to other |
---|
1402 | representations.</t> |
---|
1403 | |
---|
1404 | <t>Equivalence of IRIs MUST rely on the assumption that IRIs are |
---|
1405 | appropriately pre-character-normalized rather than apply character |
---|
1406 | normalization when comparing two IRIs. The exceptions are conversion |
---|
1407 | from a non-digital form, and conversion from a non-UCS-based character |
---|
1408 | encoding to a UCS-based character encoding. In these cases, NFC or a |
---|
1409 | normalizing transcoder using NFC MUST be used for interoperability. To |
---|
1410 | avoid false negatives and problems with transcoding, IRIs SHOULD be |
---|
1411 | created by using NFC. Using NFKC may avoid even more problems; for |
---|
1412 | example, by choosing half-width Latin letters instead of full-width |
---|
1413 | ones, and full-width instead of half-width Katakana.</t> |
---|
1414 | |
---|
1415 | |
---|
1416 | <t>As an example, |
---|
1417 | "http://www.example.org/r&#xE9;sum&#xE9;.html" (in XML |
---|
1418 | Notation) is in NFC. On the other hand, |
---|
1419 | "http://www.example.org/re&#x301;sume&#x301;.html" is not in |
---|
1420 | NFC.</t> |
---|
1421 | |
---|
1422 | <t>The former uses precombined e-acute characters, and the latter uses |
---|
1423 | "e" characters followed by combining acute accents. Both usages are |
---|
1424 | defined as canonically equivalent in <xref target="UNIV6"></xref>.</t> |
---|
1425 | |
---|
1426 | <t><list style="hanging"> |
---|
1427 | |
---|
1428 | <t hangText="Note:"> |
---|
1429 | Because it is unknown how a particular sequence of characters is being |
---|
1430 | treated with respect to character normalization, it would be |
---|
1431 | inappropriate to allow third parties to normalize an IRI |
---|
1432 | arbitrarily. This does not contradict the recommendation that when a |
---|
1433 | resource is created, its IRI should be as character normalized as |
---|
1434 | possible (i.e., NFC or even NFKC). This is similar to the |
---|
1435 | uppercase/lowercase problems. Some parts of a URI are case |
---|
1436 | insensitive (for example, the domain name). For others, it is unclear |
---|
1437 | whether they are case sensitive, case insensitive, or something in |
---|
1438 | between (e.g., case sensitive, but with a multiple choice selection if |
---|
1439 | the wrong case is used, instead of a direct negative result). The |
---|
1440 | best recipe is that the creator use a reasonable capitalization and, |
---|
1441 | when transferring the URI, capitalization never be |
---|
1442 | changed.</t></list></t> |
---|
1443 | |
---|
1444 | <t>Various IRI schemes may allow the usage of Internationalized Domain |
---|
1445 | Names (IDN) <xref target="RFC3490"></xref> either in the ireg-name |
---|
1446 | part or elsewhere. Character Normalization also applies to IDNs, as |
---|
1447 | discussed in <xref target="schemecomp"></xref>.</t> |
---|
1448 | </section> <!-- charnorm --> |
---|
1449 | |
---|
1450 | <section title="Percent-Encoding Normalization"> |
---|
1451 | |
---|
1452 | <t>The percent-encoding mechanism (Section 2.1 of <xref |
---|
1453 | target="RFC3986"></xref>) is a frequent source of variance among |
---|
1454 | otherwise identical IRIs. In addition to the case normalization issue |
---|
1455 | noted above, some IRI producers percent-encode octets that do not |
---|
1456 | require percent-encoding, resulting in IRIs that are equivalent to |
---|
1457 | their nonencoded counterparts. These IRIs should be normalized by |
---|
1458 | decoding any percent-encoded octet sequence that corresponds to an |
---|
1459 | unreserved character, as described in Section 2.3 of <xref |
---|
1460 | target="RFC3986"></xref>.</t> |
---|
1461 | |
---|
1462 | <t>For actual resolution, differences in percent-encoding (except for |
---|
1463 | the percent-encoding of reserved characters) MUST always result in the |
---|
1464 | same resource. For example, "http://example.org/~user", |
---|
1465 | "http://example.org/%7euser", and "http://example.org/%7Euser" must |
---|
1466 | resolve to the same resource.</t> |
---|
1467 | |
---|
1468 | <t>If this kind of equivalence is to be tested, the percent-encoding |
---|
1469 | of both IRIs to be compared has to be aligned; for example, by |
---|
1470 | converting both IRIs to URIs (see Section 3.1), eliminating escape |
---|
1471 | differences in the resulting URIs, and making sure that the case of |
---|
1472 | the hexadecimal characters in the percent-encoding is always the same |
---|
1473 | (preferably upper case). If the IRI is to be passed to another |
---|
1474 | application or used further in some other way, its original form MUST |
---|
1475 | be preserved. The conversion described here should be performed only |
---|
1476 | for local comparison.</t> |
---|
1477 | |
---|
1478 | </section> <!-- pctnorm --> |
---|
1479 | |
---|
1480 | <section title="Path Segment Normalization"> |
---|
1481 | |
---|
1482 | <t>The complete path segments "." and ".." are intended only for use |
---|
1483 | within relative references (Section 4.1 of <xref |
---|
1484 | target="RFC3986"></xref>) and are removed as part of the reference |
---|
1485 | resolution process (Section 5.2 of <xref target="RFC3986"></xref>). |
---|
1486 | However, some implementations may incorrectly assume that reference |
---|
1487 | resolution is not necessary when the reference is already an IRI, and |
---|
1488 | thus fail to remove dot-segments when they occur in non-relative |
---|
1489 | paths. IRI normalizers should remove dot-segments by applying the |
---|
1490 | remove_dot_segments algorithm to the path, as described in Section |
---|
1491 | 5.2.4 of <xref target="RFC3986"></xref>.</t> |
---|
1492 | |
---|
1493 | </section> <!-- pathnorm --> |
---|
1494 | </section> <!-- syntaxnorm --> |
---|
1495 | |
---|
1496 | <section title="Scheme-Based Normalization" anchor="schemecomp"> |
---|
1497 | |
---|
1498 | <t>The syntax and semantics of IRIs vary from scheme to scheme, as |
---|
1499 | described by the defining specification for each |
---|
1500 | scheme. Implementations may use scheme-specific rules, at further |
---|
1501 | processing cost, to reduce the probability of false negatives. For |
---|
1502 | example, because the "http" scheme makes use of an authority |
---|
1503 | component, has a default port of "80", and defines an empty path to be |
---|
1504 | equivalent to "/", the following four IRIs are equivalent:</t> |
---|
1505 | |
---|
1506 | <figure><artwork> |
---|
1507 | http://example.com |
---|
1508 | http://example.com/ |
---|
1509 | http://example.com:/ |
---|
1510 | http://example.com:80/</artwork></figure> |
---|
1511 | |
---|
1512 | <t>In general, an IRI that uses the generic syntax for authority with |
---|
1513 | an empty path should be normalized to a path of "/". Likewise, an |
---|
1514 | explicit ":port", for which the port is empty or the default for the |
---|
1515 | scheme, is equivalent to one where the port and its ":" delimiter are |
---|
1516 | elided and thus should be removed by scheme-based normalization. For |
---|
1517 | example, the second IRI above is the normal form for the "http" |
---|
1518 | scheme.</t> |
---|
1519 | |
---|
1520 | <t>Another case where normalization varies by scheme is in the |
---|
1521 | handling of an empty authority component or empty host |
---|
1522 | subcomponent. For many scheme specifications, an empty authority or |
---|
1523 | host is considered an error; for others, it is considered equivalent |
---|
1524 | to "localhost" or the end-user's host. When a scheme defines a default |
---|
1525 | for authority and an IRI reference to that default is desired, the |
---|
1526 | reference should be normalized to an empty authority for the sake of |
---|
1527 | uniformity, brevity, and internationalization. If, however, either the |
---|
1528 | userinfo or port subcomponents are non-empty, then the host should be |
---|
1529 | given explicitly even if it matches the default.</t> |
---|
1530 | |
---|
1531 | <t>Normalization should not remove delimiters when their associated |
---|
1532 | component is empty unless it is licensed to do so by the scheme |
---|
1533 | specification. For example, the IRI "http://example.com/?" cannot be |
---|
1534 | assumed to be equivalent to any of the examples above. Likewise, the |
---|
1535 | presence or absence of delimiters within a userinfo subcomponent is |
---|
1536 | usually significant to its interpretation. The fragment component is |
---|
1537 | not subject to any scheme-based normalization; thus, two IRIs that |
---|
1538 | differ only by the suffix "#" are considered different regardless of |
---|
1539 | the scheme.</t> |
---|
1540 | |
---|
1541 | <t>Some IRI schemes allow the usage of Internationalized Domain |
---|
1542 | Names (IDN) <xref target='RFC5890'></xref> either in their ireg-name |
---|
1543 | part or elsewhere. When in use in IRIs, those names SHOULD |
---|
1544 | conform to the definition of U-Label in <xref |
---|
1545 | target='RFC5890'></xref>. An IRI containing an invalid IDN cannot |
---|
1546 | successfully be resolved. For legibility purposes, they |
---|
1547 | SHOULD NOT be converted into ASCII Compatible Encoding (ACE).</t> |
---|
1548 | |
---|
1549 | <t>Scheme-based normalization may also consider IDN |
---|
1550 | components and their conversions to Punycode as equivalent. As an |
---|
1551 | example, "http://r&#xE9;sum&#xE9;.example.org" may be |
---|
1552 | considered equivalent to |
---|
1553 | "http://xn--rsum-bpad.example.org".</t><t>Other scheme-specific |
---|
1554 | normalizations are possible.</t> |
---|
1555 | |
---|
1556 | </section> <!-- schemenorm --> |
---|
1557 | |
---|
1558 | <section title="Protocol-Based Normalization"> |
---|
1559 | |
---|
1560 | <t>Substantial effort to reduce the incidence of false negatives is |
---|
1561 | often cost-effective for web spiders. Consequently, they implement |
---|
1562 | even more aggressive techniques in IRI comparison. For example, if |
---|
1563 | they observe that an IRI such as</t> |
---|
1564 | |
---|
1565 | <figure><artwork> |
---|
1566 | http://example.com/data</artwork></figure> |
---|
1567 | <t>redirects to an IRI differing only in the trailing slash</t> |
---|
1568 | <figure><artwork> |
---|
1569 | http://example.com/data/</artwork></figure> |
---|
1570 | |
---|
1571 | <t>they will likely regard the two as equivalent in the future. This |
---|
1572 | kind of technique is only appropriate when equivalence is clearly |
---|
1573 | indicated by both the result of accessing the resources and the common |
---|
1574 | conventions of their scheme's dereference algorithm (in this case, use |
---|
1575 | of redirection by HTTP origin servers to avoid problems with relative |
---|
1576 | references).</t> |
---|
1577 | |
---|
1578 | </section> <!-- protonorm --> |
---|
1579 | </section> <!-- ladder --> |
---|
1580 | </section> |
---|
1581 | |
---|
1582 | <section title="Use of IRIs" anchor="IRIuse"> |
---|
1583 | |
---|
1584 | <section title="Limitations on UCS Characters Allowed in IRIs" anchor="limitations"> |
---|
1585 | |
---|
1586 | <t>This section discusses limitations on characters and character |
---|
1587 | sequences usable for IRIs beyond those given in <xref target="abnf"/> |
---|
1588 | and <xref target="visual"/>. The considerations in this section are |
---|
1589 | relevant when IRIs are created and when URIs are converted to |
---|
1590 | IRIs.</t> |
---|
1591 | |
---|
1592 | <t> |
---|
1593 | |
---|
1594 | <list style="hanging"><t hangText="a.">The repertoire of characters allowed |
---|
1595 | in each IRI component is limited by the definition of that component. |
---|
1596 | For example, the definition of the scheme component does not allow |
---|
1597 | characters beyond US-ASCII. |
---|
1598 | <vspace blankLines="1"/> |
---|
1599 | (Note: In accordance with URI practice, generic IRI |
---|
1600 | software cannot and should not check for such limitations.)</t> |
---|
1601 | |
---|
1602 | <t hangText="b.">The UCS contains many areas of characters for which |
---|
1603 | there are strong visual look-alikes. Because of the likelihood of |
---|
1604 | transcription errors, these also should be avoided. This includes |
---|
1605 | the full-width equivalents of Latin characters, half-width |
---|
1606 | Katakana characters for Japanese, and many others. It also |
---|
1607 | includes many look-alikes of "space", "delims", and "unwise", |
---|
1608 | characters excluded in <xref target="RFC3491"/>.</t> |
---|
1609 | |
---|
1610 | </list> |
---|
1611 | </t> |
---|
1612 | |
---|
1613 | <t>Additional information is available from <xref target="UNIXML"/>. |
---|
1614 | <xref target="UNIXML"/> is written in the context of running text |
---|
1615 | rather than in that of identifiers. Nevertheless, it discusses |
---|
1616 | many of the categories of characters not appropriate for IRIs.</t> |
---|
1617 | </section> <!-- limitations --> |
---|
1618 | |
---|
1619 | <section title="Software Interfaces and Protocols"> |
---|
1620 | |
---|
1621 | <t>Although an IRI is defined as a sequence of characters, software |
---|
1622 | interfaces for URIs typically function on sequences of octets or other |
---|
1623 | kinds of code units. Thus, software interfaces and protocols MUST |
---|
1624 | define which character encoding is used.</t> |
---|
1625 | |
---|
1626 | <t>Intermediate software interfaces between IRI-capable components and |
---|
1627 | URI-only components MUST map the IRIs per <xref target="mapping"/>, |
---|
1628 | when transferring from IRI-capable to URI-only components. |
---|
1629 | |
---|
1630 | This mapping SHOULD be applied as late as possible. It SHOULD NOT be |
---|
1631 | applied between components that are known to be able to handle IRIs.</t> |
---|
1632 | </section> <!-- software --> |
---|
1633 | |
---|
1634 | <section title="Format of URIs and IRIs in Documents and Protocols"> |
---|
1635 | |
---|
1636 | <t>Document formats that transport URIs may have to be upgraded to allow |
---|
1637 | the transport of IRIs. In cases where the document as a whole |
---|
1638 | has a native character encoding, IRIs MUST also be encoded in this |
---|
1639 | character encoding and converted accordingly by a parser or interpreter. |
---|
1640 | |
---|
1641 | IRI characters not expressible in the native character encoding SHOULD |
---|
1642 | be escaped by using the escaping conventions of the document format if |
---|
1643 | such conventions are available. Alternatively, they MAY be |
---|
1644 | percent-encoded according to <xref target="mapping"/>. For example, in |
---|
1645 | HTML or XML, numeric character references SHOULD be used. If a |
---|
1646 | document as a whole has a native character encoding and that character |
---|
1647 | encoding is not UTF-8, then IRIs MUST NOT be placed into the document |
---|
1648 | in the UTF-8 character encoding.</t> |
---|
1649 | |
---|
1650 | <t>((UPDATE THIS NOTE)) Note: Some formats already accommodate IRIs, |
---|
1651 | although they use different terminology. HTML 4.0 <xref |
---|
1652 | target="HTML4"/> defines the conversion from IRIs to URIs as |
---|
1653 | error-avoiding behavior. XML 1.0 <xref target="XML1"/>, XLink <xref |
---|
1654 | target="XLink"/>, XML Schema <xref target="XMLSchema"/>, and |
---|
1655 | specifications based upon them allow IRIs. Also, it is expected that |
---|
1656 | all relevant new W3C formats and protocols will be required to handle |
---|
1657 | IRIs <xref target="CharMod"/>.</t> |
---|
1658 | |
---|
1659 | </section> <!-- format --> |
---|
1660 | |
---|
1661 | <section title="Use of UTF-8 for Encoding Original Characters" anchor="UTF8use"> |
---|
1662 | |
---|
1663 | <t>This section discusses details and gives examples for point c) in |
---|
1664 | <xref target="Applicability"/>. To be able to use IRIs, the URI |
---|
1665 | corresponding to the IRI in question has to encode original characters |
---|
1666 | into octets by using UTF-8. This can be specified for all URIs of a |
---|
1667 | URI scheme or can apply to individual URIs for schemes that do not |
---|
1668 | specify how to encode original characters. It can apply to the whole |
---|
1669 | URI, or only to some part. For background information on encoding |
---|
1670 | characters into URIs, see also Section 2.5 of <xref |
---|
1671 | target="RFC3986"/>.</t> |
---|
1672 | |
---|
1673 | <t>For new URI schemes, using UTF-8 is recommended in <xref |
---|
1674 | target="RFC4395bis"/>. Examples where UTF-8 is already used are the URN |
---|
1675 | syntax <xref target="RFC2141"/>, IMAP URLs <xref target="RFC2192"/>, |
---|
1676 | and POP URLs <xref target="RFC2384"/>. On the other hand, because the |
---|
1677 | HTTP URI scheme does not specify how to encode original characters, |
---|
1678 | only some HTTP URLs can have corresponding but different IRIs.</t> |
---|
1679 | |
---|
1680 | <t>For example, for a document with a URI |
---|
1681 | of<vspace/>"http://www.example.org/r%C3%A9sum%C3%A9.html", it is |
---|
1682 | possible to construct a corresponding IRI (in XML notation, see <xref |
---|
1683 | target="sec-Notation"/>): |
---|
1684 | "http://www.example.org/r&#xE9;sum&#xE9;.html" ("&#xE9;" |
---|
1685 | stands for the e-acute character, and "%C3%A9" is the UTF-8 encoded |
---|
1686 | and percent-encoded representation of that character). On the other |
---|
1687 | hand, for a document with a URI of |
---|
1688 | "http://www.example.org/r%E9sum%E9.html", the percent-encoding octets |
---|
1689 | cannot be converted to actual characters in an IRI, as the |
---|
1690 | percent-encoding is not based on UTF-8.</t> |
---|
1691 | |
---|
1692 | <t>For most URI schemes, there is no need to upgrade their scheme |
---|
1693 | definition in order for them to work with IRIs. The main case where |
---|
1694 | upgrading makes sense is when a scheme definition, or a particular |
---|
1695 | component of a scheme, is strictly limited to the use of US-ASCII |
---|
1696 | characters with no provision to include non-ASCII characters/octets |
---|
1697 | via percent-encoding, or if a scheme definition currently uses highly |
---|
1698 | scheme-specific provisions for the encoding of non-ASCII characters. |
---|
1699 | An example of this is the mailto: scheme <xref target="RFC2368"/>.</t> |
---|
1700 | |
---|
1701 | <t>This specification updates the IANA registry of URI schemes to note |
---|
1702 | their applicability to IRIs, see <xref target="iana"/>. All IRIs use |
---|
1703 | URI schemes, and all URIs with URI schemes can be used as IRIs, even |
---|
1704 | though in some cases only by using URIs directly as IRIs, without any |
---|
1705 | conversion.</t> |
---|
1706 | |
---|
1707 | <t>Scheme definitions can impose restrictions on the syntax of |
---|
1708 | scheme-specific URIs; i.e., URIs that are admissible under the generic |
---|
1709 | URI syntax <xref target="RFC3986"/> may not be admissible due to |
---|
1710 | narrower syntactic constraints imposed by a URI scheme |
---|
1711 | specification. URI scheme definitions cannot broaden the syntactic |
---|
1712 | restrictions of the generic URI syntax; otherwise, it would be |
---|
1713 | possible to generate URIs that satisfied the scheme-specific syntactic |
---|
1714 | constraints without satisfying the syntactic constraints of the |
---|
1715 | generic URI syntax. However, additional syntactic constraints imposed |
---|
1716 | by URI scheme specifications are applicable to IRIs, as the |
---|
1717 | corresponding URI resulting from the mapping defined in <xref |
---|
1718 | target="mapping"/> MUST be a valid URI under the syntactic |
---|
1719 | restrictions of generic URI syntax and any narrower restrictions |
---|
1720 | imposed by the corresponding URI scheme specification.</t> |
---|
1721 | |
---|
1722 | <t>The requirement for the use of UTF-8 generally applies to all parts |
---|
1723 | of a URI. However, it is possible that the capability of IRIs to |
---|
1724 | represent a wide range of characters directly is used just in some |
---|
1725 | parts of the IRI (or IRI reference). The other parts of the IRI may |
---|
1726 | only contain US-ASCII characters, or they may not be based on |
---|
1727 | UTF-8. They may be based on another character encoding, or they may |
---|
1728 | directly encode raw binary data (see also <xref |
---|
1729 | target="RFC2397"/>). </t> |
---|
1730 | |
---|
1731 | <t>For example, it is possible to have a URI reference |
---|
1732 | of<vspace/>"http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", |
---|
1733 | where the document name is encoded in iso-8859-1 based on server |
---|
1734 | settings, but where the fragment identifier is encoded in UTF-8 according |
---|
1735 | to <xref target="XPointer"/>. The IRI corresponding to the above |
---|
1736 | URI would be (in XML notation)<vspace/>"http://www.example.org/r%E9sum%E9.xml#r&#xE9;sum&#xE9;".</t> |
---|
1737 | |
---|
1738 | <t>Similar considerations apply to query parts. The functionality |
---|
1739 | of IRIs (namely, to be able to include non-ASCII characters) can |
---|
1740 | only be used if the query part is encoded in UTF-8.</t> |
---|
1741 | |
---|
1742 | </section> <!-- utf8 --> |
---|
1743 | |
---|
1744 | <section title="Relative IRI References"> |
---|
1745 | <t>Processing of relative IRI references against a base is handled |
---|
1746 | straightforwardly; the algorithms of <xref target="RFC3986"/> can |
---|
1747 | be applied directly, treating the characters additionally allowed |
---|
1748 | in IRI references in the same way that unreserved characters are in URI |
---|
1749 | references.</t> |
---|
1750 | |
---|
1751 | </section> <!-- relative --> |
---|
1752 | </section> <!-- IRIuse --> |
---|
1753 | |
---|
1754 | <section title="Liberal Handling of Otherwise Invalid IRIs" anchor="LEIRIHREF"> |
---|
1755 | |
---|
1756 | <t>(EDITOR NOTE: This Section may move to an appendix.) |
---|
1757 | |
---|
1758 | Some technical specifications and widely-deployed software have |
---|
1759 | allowed additional variations and extensions of IRIs to be used in |
---|
1760 | syntactic components. This section describes two widely-used |
---|
1761 | preprocessing agreements. Other technical specifications may wish to |
---|
1762 | reference a syntactic component which is "a valid IRI or a string that |
---|
1763 | will map to a valid IRI after this preprocessing algorithm". These two |
---|
1764 | variants are known as <xref target="LEIRI">Legacy Extended IRI or |
---|
1765 | LEIRI</xref>, and <xref target="HTML5">Web Address</xref>. |
---|
1766 | </t> |
---|
1767 | |
---|
1768 | <t>Future technical specifications SHOULD NOT allow conforming |
---|
1769 | producers to produce, or conforming content to contain, such forms, |
---|
1770 | as they are not interoperable with other IRI consuming software.</t> |
---|
1771 | |
---|
1772 | <section title="LEIRI Processing" anchor="LEIRIspec"> |
---|
1773 | <t>This section defines Legacy Extended IRIs (LEIRIs). |
---|
1774 | The syntax of Legacy Extended IRIs is the same as that for <IRI-reference>, |
---|
1775 | except that the ucschar production is replaced by the leiri-ucschar production:</t> |
---|
1776 | <figure> |
---|
1777 | |
---|
1778 | <artwork> |
---|
1779 | leiri-ucschar = " " / "<" / ">" / DQUOTE / "{" / "}" / "|"
---|
1780 | / "\" / "^" / "`" / %x0-1F / %x7F-D7FF |
---|
1781 | / %xE000-FFFD / %x10000-10FFFF |
---|
1782 | </artwork> |
---|
1783 | |
---|
1784 | <postamble> |
---|
1785 | Among other extensions, processors based on this specification also |
---|
1786 | did not enforce the restriction on bidirectional formatting |
---|
1787 | characters in <xref target="visual"></xref>, and the iprivate |
---|
1788 | production becomes redundant.</postamble> |
---|
1789 | </figure> |
---|
1790 | |
---|
1791 | <t>To convert a string allowed as a LEIRI to an IRI, each character |
---|
1792 | allowed in leiri-ucschar but not in ucschar must be percent-encoded |
---|
1793 | using <xref target="compmapping"/>.</t> |
---|
1794 | </section> <!-- leiriproc --> |
---|
1795 | |
---|
1796 | <section title="Web Address Processing" anchor="webaddress"> |
---|
1797 | |
---|
1798 | <t>Many popular web browsers have taken the approach of being quite |
---|
1799 | liberal in what is accepted as a "URL" or its relative |
---|
1800 | forms. This section describes their behavior in terms of a preprocessor |
---|
1801 | which maps strings into the IRI space for subsequent parsing and |
---|
1802 | interpretation as an IRI.</t> |
---|
1803 | |
---|
1804 | <t>In some situations, it might be appropriate to describe the syntax |
---|
1805 | that a liberal consumer implementation might accept as a "Web |
---|
1806 | Address" or "Hypertext Reference" or "HREF". However, |
---|
1807 | technical specifications SHOULD restrict the syntactic form allowed by compliant producers |
---|
1808 | to the IRI or IRI reference syntax defined in this document |
---|
1809 | even if they want to mandate this processing.</t> |
---|
1810 | |
---|
1811 | <t> |
---|
1812 | Summary: |
---|
1813 | <list style="symbols"> |
---|
1814 | <t>Leading and trailing whitespace is removed.</t> |
---|
1815 | <t>Some additional characters are removed.</t> |
---|
1816 | <t>Some additional characters are allowed and escaped (as with LEIRI).</t> |
---|
1817 | <t>If interpreting an IRI as a URI, the pct-encoding of the query |
---|
1818 | component of the parsed URI component depends on operational |
---|
1819 | context.</t> |
---|
1820 | </list> |
---|
1821 | </t> |
---|
1822 | |
---|
1823 | <t>Each string provided may have an associated charset (called |
---|
1824 | the HRef-charset here); this defaults to UTF-8. |
---|
1825 | For web browsers interpreting HTML, the document |
---|
1826 | charset of a string is determined as follows: |
---|
1827 | |
---|
1828 | <list style="hanging"> |
---|
1829 | <t hangText="If the string came from a script (e.g. as an argument to |
---|
1830 | a method)">The HRef-charset is the script's charset.</t> |
---|
1831 | |
---|
1832 | <t hangText="If the string came from a DOM node (e.g. from an |
---|
1833 | element)">The node has a Document, and the HRef-charset is the |
---|
1834 | Document's character encoding.</t> |
---|
1835 | |
---|
1836 | <t hangText="If the string had a HRef-charset defined when the string was |
---|
1837 | created or defined">The HRef-charset is as defined.</t> |
---|
1838 | |
---|
1839 | </list></t> |
---|
1840 | |
---|
1841 | <t>If the resulting HRef-charset is a Unicode-based character encoding |
---|
1842 | (e.g., UTF-16), then use UTF-8 instead.</t> |
---|
1843 | |
---|
1844 | |
---|
1845 | <figure> |
---|
1846 | <preamble>The syntax for Web Addresses is obtained by replacing the 'ucschar', |
---|
1847 | pct-form, and path-sep rules with the href-ucschar, href-pct-form, and href-path-sep |
---|
1848 | rules below. In addition, some characters are stripped.</preamble> |
---|
1849 | |
---|
1850 | <artwork type='abnf'> |
---|
1851 | href-ucschar = " " / "<" / ">" / DQUOTE / "{" / "}" / "|" |
---|
1852 | / "\" / "^" / "`" / %x0-1F / %x7F-D7FF |
---|
1853 | / %xE000-FFFD / %x10000-10FFFF |
---|
1854 | href-pct-form = pct-encoded / "%" |
---|
1855 | href-path-sep = "/" / "\" |
---|
1856 | href-strip = <to be done> |
---|
1857 | </artwork> |
---|
1858 | |
---|
1859 | <postamble> |
---|
1860 | (NOTE: NEED TO FIX THESE SETS TO MATCH HTML5; NOT SURE ABOUT NEXT SENTENCE) |
---|
1861 | browsers did not enforce the restriction on bidirectional formatting |
---|
1862 | characters in <xref target="visual"></xref>, and the iprivate |
---|
1863 | production becomes redundant.</postamble> |
---|
1864 | </figure> |
---|
1865 | |
---|
1866 | <t>'Web Address processing' requires the following additional |
---|
1867 | preprocessing steps: |
---|
1868 | |
---|
1869 | <list style="numbers"> |
---|
1870 | |
---|
1871 | <t>Leading and trailing instances of space (U+0020), |
---|
1872 | CR (U+000D), LF (U+000A), and TAB (U+0009) characters are removed.</t> |
---|
1873 | |
---|
1874 | <t>Strip all characters in href-strip.</t> |
---|
1875 | <t>Percent-encode all characters in href-ucschar not in ucschar.</t> |
---|
1876 | <t>Replace occurrences of "%" not followed by two hexadecimal digits by "%25".</t> |
---|
1877 | <t>Convert backslashes ('\') matching href-path-sep to forward slashes ('/').</t> |
---|
1878 | </list></t> |
---|
1879 | </section> <!-- webaddress --> |
---|
1880 | |
---|
1881 | <section title="Characters Not Allowed in IRIs" anchor="notAllowed"> |
---|
1882 | |
---|
1883 | <t>This section provides a list of the groups of characters and code |
---|
1884 | points that are allowed by LEIRI or HREF but are not allowed in IRIs or are |
---|
1885 | allowed in IRIs only in the query part. For each group of characters, |
---|
1886 | advice on the usage of these characters is also given, concentrating |
---|
1887 | on the reasons for why they are excluded from IRI use.</t> |
---|
1888 | |
---|
1889 | <t> |
---|
1890 | |
---|
1891 | <list><t>Space (U+0020): Some formats and applications use space as a |
---|
1892 | delimiter, e.g. for items in a list. Appendix C of <xref |
---|
1893 | target="RFC3986"></xref> also mentions that white space may have to be |
---|
1894 | added when displaying or printing long URIs; the same applies to long |
---|
1895 | IRIs. This means that spaces can disappear, or can cause what is |
---|
1896 | intended as a single IRI or IRI reference to be treated as two or more |
---|
1897 | separate IRIs.</t> |
---|
1898 | |
---|
1899 | <t>Delimiters "<" (U+003C), ">" (U+003E), and '"' (U+0022): |
---|
1900 | Appendix C of <xref target="RFC3986"></xref> suggests the use of |
---|
1901 | double-quotes ("http://example.com/") and angle brackets |
---|
1902 | (<http://example.com/>) as delimiters for URIs in plain |
---|
1903 | text. These conventions are often used, and also apply to IRIs. Using |
---|
1904 | these characters in strings intended to be IRIs would result in the |
---|
1905 | IRIs being cut off at the wrong place.</t> |
---|
1906 | |
---|
1907 | <t>Unwise characters "\" (U+005C), "^" (U+005E), "`" |
---|
1908 | (U+0060), "{" (U+007B), "|" (U+007C), and "}" (U+007D): These |
---|
1909 | characters originally have been excluded from URIs because the |
---|
1910 | respective codepoints are assigned to different graphic characters in |
---|
1911 | some 7-bit or 8-bit encoding. Despite the move to Unicode, some of |
---|
1912 | these characters are still occasionally displayed differently on some |
---|
1913 | systems, e.g. U+005C may appear as a Japanese Yen symbol on some |
---|
1914 | systems. Also, the fact that these characters are not used in URIs or |
---|
1915 | IRIs has encouraged their use outside URIs or IRIs in contexts that |
---|
1916 | may include URIs or IRIs. If a string with such a character were used |
---|
1917 | as an IRI in such a context, it would likely be interpreted |
---|
1918 | piecemeal.</t> |
---|
1919 | |
---|
1920 | <t>The controls (C0 controls, DEL, and C1 controls, #x0 - #x1F, #x7F - |
---|
1921 | #x9F): There is generally no way to transmit these characters reliably |
---|
1922 | as text outside of a charset encoding. Even when in encoded form, |
---|
1923 | many software components silently filter out some of these characters, |
---|
1924 | or may stop processing altogether when encountering some of |
---|
1925 | them. These characters may affect text display in subtle, unnoticeable |
---|
1926 | ways or in drastic, global, and irreversible ways depending on the |
---|
1927 | hardware and software involved. The use of some of these characters |
---|
1928 | would allow malicious users to manipulate the display of an IRI and |
---|
1929 | its context in many situations.</t> |
---|
1930 | |
---|
1931 | <t>Bidi formatting characters (U+200E, U+200F, U+202A-202E): These |
---|
1932 | characters affect the display ordering of characters. If IRIs were |
---|
1933 | allowed to contain these characters and the resulting visual display |
---|
1934 | transcribed, they could not be converted back to electronic form |
---|
1935 | (logical order) unambiguously. These characters, if allowed in IRIs, |
---|
1936 | might allow malicious users to manipulate the display of an IRI and its |
---|
1937 | context.</t> |
---|
1938 | |
---|
1939 | <t>Specials (U+FFF0-FFFD): These code points provide functionality |
---|
1940 | beyond that useful in an IRI, for example byte order identification, |
---|
1941 | annotation, and replacements for unknown characters and objects. Their |
---|
1942 | use and interpretation in an IRI would serve no purpose and might lead |
---|
1943 | to confusing display variations.</t> |
---|
1944 | |
---|
1945 | <t>Private use code points (U+E000-F8FF, U+F0000-FFFFD, |
---|
1946 | U+100000-10FFFD): Display and interpretation of these code points is |
---|
1947 | by definition undefined without private agreement. Therefore, these |
---|
1948 | code points are not suited for use on the Internet. They are not |
---|
1949 | interoperable and may have unpredictable effects.</t> |
---|
1950 | |
---|
1951 | <t>Tags (U+E0000-E0FFF): These characters provide a way to language |
---|
1952 | tag in Unicode plain text. They are not appropriate for IRIs because |
---|
1953 | language information in identifiers cannot reliably be input, |
---|
1954 | transmitted (e.g. on a visual medium such as paper), or |
---|
1955 | recognized.</t> |
---|
1956 | |
---|
1957 | <t>Non-characters (U+FDD0-FDEF, U+1FFFE-1FFFF, U+2FFFE-2FFFF, |
---|
1958 | U+3FFFE-3FFFF, U+4FFFE-4FFFF, U+5FFFE-5FFFF, U+6FFFE-6FFFF, |
---|
1959 | U+7FFFE-7FFFF, U+8FFFE-8FFFF, U+9FFFE-9FFFF, U+AFFFE-AFFFF, |
---|
1960 | U+BFFFE-BFFFF, U+CFFFE-CFFFF, U+DFFFE-DFFFF, U+EFFFE-EFFFF, |
---|
1961 | U+FFFFE-FFFFF, U+10FFFE-10FFFF): These code points are defined as |
---|
1962 | non-characters. Applications may use some of them internally, but are |
---|
1963 | not prepared to interchange them.</t> |
---|
1964 | |
---|
1965 | </list></t> |
---|
1966 | |
---|
1967 | <t>LEIRI preprocessing disallowed some code points and |
---|
1968 | code units: |
---|
1969 | |
---|
1970 | <list><t>Surrogate code units (D800-DFFF): These do not represent |
---|
1971 | Unicode codepoints.</t></list></t> |
---|
1972 | </section> <!-- notallowed --> |
---|
1973 | </section> <!-- lieirihref --> |
---|
1974 | |
---|
1975 | <section title="URI/IRI Processing Guidelines (Informative)" anchor="guidelines"> |
---|
1976 | |
---|
1977 | <t>This informative section provides guidelines for supporting IRIs in |
---|
1978 | the same software components and operations that currently process |
---|
1979 | URIs: Software interfaces that handle URIs, software that allows users |
---|
1980 | to enter URIs, software that creates or generates URIs, software that |
---|
1981 | displays URIs, formats and protocols that transport URIs, and software |
---|
1982 | that interprets URIs. These may all require modification before |
---|
1983 | functioning properly with IRIs. The considerations in this section |
---|
1984 | also apply to URI references and IRI references.</t> |
---|
1985 | |
---|
1986 | <section title="URI/IRI Software Interfaces"> |
---|
1987 | <t>Software interfaces that handle URIs, such as URI-handling APIs and |
---|
1988 | protocols transferring URIs, need interfaces and protocol elements |
---|
1989 | that are designed to carry IRIs.</t> |
---|
1990 | |
---|
1991 | <t>In case the current handling in an API or protocol is based on |
---|
1992 | US-ASCII, UTF-8 is recommended as the character encoding for IRIs, as |
---|
1993 | it is compatible with US-ASCII, is in accordance with the |
---|
1994 | recommendations of <xref target="RFC2277"/>, and makes converting to |
---|
1995 | URIs easy. In any case, the API or protocol definition must clearly |
---|
1996 | define the character encoding to be used.</t> |
---|
1997 | |
---|
1998 | <t>The transfer from URI-only to IRI-capable components requires no |
---|
1999 | mapping, although the conversion described in <xref |
---|
2000 | target="URItoIRI"/> above may be performed. It is preferable not to |
---|
2001 | perform this inverse conversion unless it is certain this can be done |
---|
2002 | correctly.</t> |
---|
2003 | </section> |
---|
2004 | |
---|
2005 | <section title="URI/IRI Entry"> |
---|
2006 | |
---|
2007 | <t>Some components allow users to enter URIs into the system |
---|
2008 | by typing or dictation, for example. This software must be updated to allow |
---|
2009 | for IRI entry.</t> |
---|
2010 | |
---|
2011 | <t>A person viewing a visual representation of an IRI (as a sequence |
---|
2012 | of glyphs, in some order, in some visual display) or hearing an IRI |
---|
2013 | will use an entry method for characters in the user's language to |
---|
2014 | input the IRI. Depending on the script and the input method used, this |
---|
2015 | may be a more or less complicated process.</t> |
---|
2016 | |
---|
2017 | <t>The process of IRI entry must ensure, as much as possible, that the |
---|
2018 | restrictions defined in <xref target="abnf"/> are met. This may be |
---|
2019 | done by choosing appropriate input methods or variants/settings |
---|
2020 | thereof, by appropriately converting the characters being input, by |
---|
2021 | eliminating characters that cannot be converted, and/or by issuing a |
---|
2022 | warning or error message to the user.</t> |
---|
2023 | |
---|
2024 | <t>As an example of variant settings, input method editors for East |
---|
2025 | Asian Languages usually allow the input of Latin letters and related |
---|
2026 | characters in full-width or half-width versions. For IRI input, the |
---|
2027 | input method editor should be set so that it produces half-width Latin |
---|
2028 | letters and punctuation and full-width Katakana.</t> |
---|
2029 | |
---|
2030 | <t>An input field primarily or solely used for the input of URIs/IRIs |
---|
2031 | might allow the user to view an IRI as it is mapped to a URI. Places |
---|
2032 | where the input of IRIs is frequent may provide the possibility for |
---|
2033 | viewing an IRI as mapped to a URI. This will help users when some of |
---|
2034 | the software they use does not yet accept IRIs.</t> |
---|
2035 | |
---|
2036 | <t>An IRI input component interfacing to components that handle URIs, |
---|
2037 | but not IRIs, must map the IRI to a URI before passing it to these |
---|
2038 | components.</t> |
---|
2039 | |
---|
2040 | <t>For the input of IRIs with right-to-left characters, please see |
---|
2041 | <xref target="bidiInput"></xref>.</t> |
---|
2042 | </section> |
---|
2043 | |
---|
2044 | <section title="URI/IRI Transfer between Applications"> |
---|
2045 | |
---|
2046 | <t>Many applications (for example, mail user agents) try to detect |
---|
2047 | URIs appearing in plain text. For this, they use some heuristics based |
---|
2048 | on URI syntax. They then allow the user to click on such URIs and |
---|
2049 | retrieve the corresponding resource in an appropriate (usually |
---|
2050 | scheme-dependent) application.</t> |
---|
2051 | |
---|
2052 | <t>Such applications would need to be upgraded, in order to use the |
---|
2053 | IRI syntax as a base for heuristics. In particular, a non-ASCII |
---|
2054 | character should not be taken as the indication of the end of an IRI. |
---|
2055 | Such applications also would need to make sure that they correctly |
---|
2056 | convert the detected IRI from the character encoding of the document |
---|
2057 | or application where the IRI appears, to the character encoding used |
---|
2058 | by the system-wide IRI invocation mechanism, or to a URI (according to |
---|
2059 | <xref target="mapping"/>) if the system-wide invocation mechanism only |
---|
2060 | accepts URIs.</t> |
---|
2061 | |
---|
2062 | <t>The clipboard is another frequently used way to transfer URIs and |
---|
2063 | IRIs from one application to another. On most platforms, the clipboard |
---|
2064 | is able to store and transfer text in many languages and scripts. |
---|
2065 | Correctly used, the clipboard transfers characters, not octets, which |
---|
2066 | will do the right thing with IRIs.</t> |
---|
2067 | </section> |
---|
2068 | |
---|
2069 | <section title="URI/IRI Generation"> |
---|
2070 | |
---|
2071 | <t>Systems that offer resources through the Internet, where those |
---|
2072 | resources have logical names, sometimes automatically generate URIs |
---|
2073 | for the resources they offer. For example, some HTTP servers can |
---|
2074 | generate a directory listing for a file directory and then respond to |
---|
2075 | the generated URIs with the files.</t> |
---|
2076 | |
---|
2077 | <t>Many legacy character encodings are in use in various file systems. |
---|
2078 | Many currently deployed systems do not transform the local character |
---|
2079 | representation of the underlying system before generating URIs.</t> |
---|
2080 | |
---|
2081 | <t>For maximum interoperability, systems that generate resource |
---|
2082 | identifiers should make the appropriate transformations. For example, |
---|
2083 | if a file system contains a file named |
---|
2084 | "r&#xE9;sum&#xE9;.html", a server should expose this as |
---|
2085 | "r%C3%A9sum%C3%A9.html" in a URI, which allows use of |
---|
2086 | "r&#xE9;sum&#xE9;.html" in an IRI, even if locally the file |
---|
2087 | name is kept in a character encoding other than UTF-8. |
---|
2088 | </t> |
---|
2089 | |
---|
2090 | <t>This recommendation particularly applies to HTTP servers. For FTP |
---|
2091 | servers, similar considerations apply; see <xref target="RFC2640"/>.</t> |
---|
2092 | </section> |
---|
2093 | |
---|
2094 | <section title="URI/IRI Selection" anchor="selection"> |
---|
2095 | <t>In some cases, resource owners and publishers have control over the |
---|
2096 | IRIs used to identify their resources. This control is mostly |
---|
2097 | executed by controlling the resource names, such as file names, |
---|
2098 | directly.</t> |
---|
2099 | |
---|
2100 | <t>In these cases, it is recommended to avoid choosing IRIs that are |
---|
2101 | easily confused. For example, for US-ASCII, the lower-case ell ("l") is |
---|
2102 | easily confused with the digit one ("1"), and the upper-case oh ("O") is |
---|
2103 | easily confused with the digit zero ("0"). Publishers should avoid |
---|
2104 | confusing users with "br0ken" or "1ame" identifiers.</t> |
---|
2105 | |
---|
2106 | <t>Outside the US-ASCII repertoire, there are many more opportunities for |
---|
2107 | confusion; a complete set of guidelines is too lengthy to include |
---|
2108 | here. As long as names are limited to characters from a single script, |
---|
2109 | native writers of a given script or language will know best when |
---|
2110 | ambiguities can appear, and how they can be avoided. What may look |
---|
2111 | ambiguous to a stranger may be completely obvious to the average |
---|
2112 | native user. On the other hand, in some cases, the UCS contains |
---|
2113 | variants for compatibility reasons; for example, for typographic purposes. |
---|
2114 | These should be avoided wherever possible. Although there may be exceptions, |
---|
2115 | newly created resource names should generally be in NFKC |
---|
2116 | <xref target="UTR15"></xref> (which means that they are also in NFC).</t> |
---|
2117 | |
---|
2118 | <t>As an example, the UCS contains the "fi" ligature at U+FB01 |
---|
2119 | for compatibility reasons. |
---|
2120 | Wherever possible, IRIs should use the two letters "f" and "i" rather |
---|
2121 | than the "fi" ligature. An example where the latter may be used is |
---|
2122 | in the query part of an IRI for an explicit search for a word written |
---|
2123 | containing the "fi" ligature.</t> |
---|
2124 | |
---|
2125 | <t>In certain cases, there is a chance that characters from different |
---|
2126 | scripts look the same. The best known example is the similarity of the |
---|
2127 | Latin "A", the Greek "Alpha", and the Cyrillic "A". To avoid such |
---|
2128 | cases, IRIs should only be created where all the characters in a |
---|
2129 | single component are used together in a given language. This usually |
---|
2130 | means that all of these characters will be from the same script, but |
---|
2131 | there are languages that mix characters from different scripts (such |
---|
2132 | as Japanese). This is similar to the heuristics used to distinguish |
---|
2133 | between letters and numbers in the examples above. Also, for Latin, |
---|
2134 | Greek, and Cyrillic, using lowercase letters results in fewer |
---|
2135 | ambiguities than using uppercase letters would.</t> |
---|
2136 | </section> |
---|
2137 | |
---|
2138 | <section title="Display of URIs/IRIs" anchor="display"> |
---|
2139 | <t> |
---|
2140 | In situations where the rendering software is not expected to display |
---|
2141 | non-ASCII parts of the IRI correctly using the available layout and font |
---|
2142 | resources, these parts should be percent-encoded before being displayed.</t> |
---|
2143 | |
---|
2144 | <t>For display of Bidi IRIs, please see <xref target="visual"/>.</t> |
---|
2145 | </section> |
---|
2146 | |
---|
2147 | <section title="Interpretation of URIs and IRIs"> |
---|
2148 | <t>Software that interprets IRIs as the names of local resources should |
---|
2149 | accept IRIs in multiple forms and convert and match them with the |
---|
2150 | appropriate local resource names.</t> |
---|
2151 | |
---|
2152 | <t>First, multiple representations include both IRIs in the native |
---|
2153 | character encoding of the protocol and also their URI counterparts.</t> |
---|
2154 | |
---|
2155 | <t>Second, it may include URIs constructed based on character |
---|
2156 | encodings other than UTF-8. These URIs may be produced by user agents that do |
---|
2157 | not conform to this specification and that use legacy character encodings to |
---|
2158 | convert non-ASCII characters to URIs. Whether this is necessary, and what |
---|
2159 | character encodings to cover, depends on a number of factors, such as |
---|
2160 | the legacy character encodings used locally and the distribution of |
---|
2161 | various versions of user agents. For example, software for Japanese |
---|
2162 | may accept URIs in Shift_JIS and/or EUC-JP in addition to UTF-8.</t> |
---|
2163 | |
---|
2164 | <t>Third, it may include additional mappings to be more user-friendly |
---|
2165 | and robust against transmission errors. These would be similar to how |
---|
2166 | some servers currently treat URIs as case insensitive or perform |
---|
2167 | additional matching to account for spelling errors. For characters |
---|
2168 | beyond the US-ASCII repertoire, this may, for example, include |
---|
2169 | ignoring the accents on received IRIs or resource names. Please note |
---|
2170 | that such mappings, including case mappings, are language |
---|
2171 | dependent.</t> |
---|
2172 | |
---|
2173 | <t>It can be difficult to identify a resource unambiguously if too |
---|
2174 | many mappings are taken into consideration. However, percent-encoded |
---|
2175 | and not percent-encoded parts of IRIs can always be clearly distinguished. |
---|
2176 | Also, the regularity of UTF-8 (see <xref target="Duerst97"/>) makes the |
---|
2177 | potential for collisions lower than it may seem at first.</t> |
---|
2178 | </section> |
---|
2179 | |
---|
2180 | <section title="Upgrading Strategy"> |
---|
2181 | <t>Where this recommendation places further constraints on software |
---|
2182 | for which many instances are already deployed, it is important to |
---|
2183 | introduce upgrades carefully and to be aware of the various |
---|
2184 | interdependencies.</t> |
---|
2185 | |
---|
2186 | <t>If IRIs cannot be interpreted correctly, they should not be created, |
---|
2187 | generated, or transported. This suggests that upgrading URI interpreting |
---|
2188 | software to accept IRIs should have highest priority.</t> |
---|
2189 | |
---|
2190 | <t>On the other hand, a single IRI is interpreted only by a single or |
---|
2191 | very few interpreters that are known in advance, although it may be |
---|
2192 | entered and transported very widely.</t> |
---|
2193 | |
---|
2194 | <t>Therefore, IRIs benefit most from a broad upgrade of software to be |
---|
2195 | able to enter and transport IRIs. However, before an |
---|
2196 | individual IRI is published, care should be taken to upgrade the corresponding |
---|
2197 | interpreting software in order to cover the forms expected to be |
---|
2198 | received by various versions of entry and transport software.</t> |
---|
2199 | |
---|
2200 | <t>The upgrade of generating software to generate IRIs instead of using a |
---|
2201 | local character encoding should happen only after the service is upgraded |
---|
2202 | to accept IRIs. Similarly, IRIs should only be generated when the service |
---|
2203 | accepts IRIs and the intervening infrastructure and protocol are known |
---|
2204 | to transport them safely.</t> |
---|
2205 | |
---|
2206 | <t>Software converting from URIs to IRIs for display should be upgraded |
---|
2207 | only after upgraded entry software has been widely deployed to the |
---|
2208 | population that will see the displayed result.</t> |
---|
2209 | |
---|
2210 | |
---|
2211 | <t>Where there is a free choice of character encodings, it is often |
---|
2212 | possible to reduce the effort and dependencies for upgrading to IRIs |
---|
2213 | by using UTF-8 rather than another encoding. For example, when a new |
---|
2214 | file-based Web server is set up, using UTF-8 as the character encoding |
---|
2215 | for file names will make the transition to IRIs easier. Likewise, when |
---|
2216 | a new Web form is set up using UTF-8 as the character encoding of the |
---|
2217 | form page, the returned query URIs will use UTF-8 as the character |
---|
2218 | encoding (unless the user, for whatever reason, changes the character |
---|
2219 | encoding) and will therefore be compatible with IRIs.</t> |
---|
2220 | |
---|
2221 | |
---|
2222 | <t>These recommendations, when taken together, will allow for the |
---|
2223 | extension from URIs to IRIs in order to handle characters other than |
---|
2224 | US-ASCII while minimizing interoperability problems. For |
---|
2225 | considerations regarding the upgrade of URI scheme definitions, see |
---|
2226 | <xref target="UTF8use"/>.</t> |
---|
2227 | |
---|
2228 | </section> |
---|
2229 | </section> <!-- guidelines --> |
---|
2230 | |
---|
2231 | <section title="IANA Considerations" anchor="iana"> |
---|
2232 | |
---|
2233 | <t>RFC Editor and IANA note: Please replace RFC XXXX with the |
---|
2234 | number of this document when it is issued as an RFC. </t> |
---|
2235 | |
---|
2236 | <t>IANA maintains a registry of "URI schemes". A "URI scheme" also |
---|
2237 | serves as an "IRI scheme". </t> |
---|
2238 | |
---|
2239 | <t>To clarify that the URI scheme registration process also applies to |
---|
2240 | IRIs, change the description of the "URI schemes" registry |
---|
2241 | header to say "[RFC4395] defines an IANA-maintained registry of URI |
---|
2242 | Schemes. These registries include the Permanent and Provisional URI |
---|
2243 | Schemes. RFC XXXX updates this registry to designate that schemes may |
---|
2244 | also indicate their usability as IRI schemes."</t> |
---|
2245 | |
---|
2246 | <t> Update "per RFC 4395" to "per RFC 4395 and RFC XXXX". |
---|
2247 | </t> |
---|
2248 | |
---|
2249 | </section> <!-- IANA --> |
---|
2250 | |
---|
2251 | <section title="Security Considerations" anchor="security"> |
---|
2252 | <t>The security considerations discussed in <xref target="RFC3986"/> |
---|
2253 | also apply to IRIs. In addition, the following issues require |
---|
2254 | particular care for IRIs.</t> |
---|
2255 | <t>Incorrect encoding or decoding can lead to security problems. |
---|
2256 | In particular, some UTF-8 decoders do not check against overlong |
---|
2257 | byte sequences. As an example, a "/" is encoded with the byte 0x2F |
---|
2258 | both in UTF-8 and in US-ASCII, but some UTF-8 decoders also wrongly |
---|
2259 | interpret the sequence 0xC0 0xAF as a "/". A sequence such as "%C0%AF.." |
---|
2260 | may pass some security tests and then be interpreted |
---|
2261 | as "/.." in a path if UTF-8 decoders are fault-tolerant, if conversion |
---|
2262 | and checking are not done in the right order, and/or if reserved |
---|
2263 | characters and unreserved characters are not clearly distinguished.</t> |
---|
2264 | |
---|
2265 | <t>There are various ways in which "spoofing" can occur with IRIs. |
---|
2266 | "Spoofing" means that somebody may add a resource name that looks the |
---|
2267 | same or similar to the user, but that points to a different resource. |
---|
2268 | The added resource may pretend to be the real resource by looking |
---|
2269 | very similar but may contain all kinds of changes that may be |
---|
2270 | difficult to spot and that can cause all kinds of problems. |
---|
2271 | Most spoofing possibilities for IRIs are extensions of those for URIs.</t> |
---|
2272 | |
---|
2273 | <t>Spoofing can occur for various reasons. First, a user's normalization expectations or actual normalization |
---|
2274 | when entering an IRI or transcoding an IRI from a legacy character |
---|
2275 | encoding do not match the normalization used on the |
---|
2276 | server side. Conceptually, this is no different from the problems |
---|
2277 | surrounding the use of case-insensitive web servers. For example, |
---|
2278 | a popular web page with a mixed-case name ("http://big.example.com/PopularPage.html") |
---|
2279 | might be "spoofed" by someone who is able to create "http://big.example.com/popularpage.html". |
---|
2280 | However, the use of unnormalized character sequences, and of additional |
---|
2281 | mappings for user convenience, may increase the chance for spoofing. |
---|
2282 | Protocols and servers that allow the creation of resources with |
---|
2283 | names that are not normalized are particularly vulnerable to such |
---|
2284 | attacks. This is an inherent |
---|
2285 | security problem of the relevant protocol, server, or resource |
---|
2286 | and is not specific to IRIs, but it is mentioned here for completeness.</t> |
---|
2287 | |
---|
2288 | <t>Spoofing can occur in various IRI components, such as the |
---|
2289 | domain name part or a path part. For considerations specific |
---|
2290 | to the domain name part, see <xref target="RFC3491"/>. |
---|
2291 | For the path part, administrators of sites that allow independent |
---|
2292 | users to create resources in the same sub area may have to be careful |
---|
2293 | to check for spoofing.</t> |
---|
2294 | |
---|
2295 | <t>Spoofing can occur because in the UCS many characters look very similar. Details are discussed in <xref target="selection"/>. |
---|
2296 | Again, this is very similar to spoofing possibilities on US-ASCII, |
---|
2297 | e.g., using "br0ken" or "1ame" URIs.</t> |
---|
2298 | |
---|
2299 | <t>Spoofing can occur when URIs with percent-encodings based on various |
---|
2300 | character encodings are accepted to deal with older user agents. In some |
---|
2301 | cases, particularly for Latin-based resource names, this is usually easy to |
---|
2302 | detect because UTF-8-encoded names, when interpreted and viewed as |
---|
2303 | legacy character encodings, produce mostly garbage.</t><t>When |
---|
2304 | concurrently used character encodings have a similar structure but there |
---|
2305 | are no characters that have exactly the same encoding, detection is more |
---|
2306 | difficult.</t> |
---|
2307 | |
---|
2308 | <t>Spoofing can occur with bidirectional IRIs, if the restrictions |
---|
2309 | in <xref target="bidi-structure"/> are not followed. The same visual |
---|
2310 | representation may be interpreted as different logical representations, |
---|
2311 | and vice versa. It is also very important that a correct Unicode bidirectional |
---|
2312 | implementation be used.</t><t>The use of Legacy Extended IRIs introduces additional security issues.</t> |
---|
2313 | </section><!-- security --> |
---|
2314 | |
---|
2315 | <section title="Acknowledgements"> |
---|
2316 | <t>This document was derived from <xref target="RFC3987"/>; the acknowledgments from |
---|
2317 | that specification still apply.</t> |
---|
2318 | <t>We would like to thank Ian Hickson, Michael Sperberg-McQueen, |
---|
2319 | and Dan Connolly for their work on HyperText References, and Norman Walsh, Richard Tobin, |
---|
2320 | Henry S. Thompson, John Cowan, Paul Grosso, and the XML Core Working Group of the W3C for their work on LEIRIs.</t> |
---|
2321 | <t>In addition, this document was influenced by contributions from (in no particular order) Chris |
---|
2322 | Lilley, Bjoern Hoehrmann, |
---|
2323 | Felix Sasaki, Jeremy Carroll, Frank Ellermann, Michael Everson, Cary Karp, Matitiahu Allouche, |
---|
2324 | Richard Ishida, Addison Phillips, Jonathan Rosenne, Najib Tounsi, Debbie Garside, Mark Davis, Sarmad |
---|
2325 | Hussain, Ted Hardie, Konrad Lanz, Thomas Roessler, Lisa Dusseault, Julian Reschke, Giovanni |
---|
2326 | Campagna, Anne van Kesteren, Mark Nottingham, Erik van der Poel, Marcin Hanclik, Marcos Caceres, Roy |
---|
2327 | Fielding, Greg Wilkins, Pieter Hintjens, Daniel R. Tobias, Marko Martin, Maciej Stachowiak, Wil |
---|
2328 | Tan, Yui Naruse, Michael A. Puls II, Dave Thaler, Tom Perch, John Klensin, Shawn Steele, Peter |
---|
2329 | Saint-Andre, Geoffrey Sneddon, Chris Weber, Alex Melnikov, Slim Amamou, SM, Tim Berners-Lee, Yaron |
---|
2330 | Goland, Sam Ruby, Adam Barth, Abdulrahman I. ALGhadir, Aharon Lanin, Thomas Milo, Murray Sargent, |
---|
2331 | Marc Blanchet, and Mykyta Yevstifeyev.</t> |
---|
2332 | </section> |
---|
2333 | |
---|
2334 | <section title="Main Changes Since RFC 3987"> |
---|
2335 | <t>This section describes the main changes since <xref target="RFC3987"></xref>.</t> |
---|
2336 | <section title="Major restructuring of IRI processing model" anchor="forkChanges"> |
---|
2337 | <t>Major restructuring of IRI processing model to make scheme-specific translation |
---|
2338 | necessary to handle IDNA requirements and for consistency with web implementations. </t> |
---|
2339 | <t>Starting with IRI, you want one of: |
---|
2340 | <list style="hanging"> |
---|
2341 | <t hangText="a"> IRI components (IRI parsed into UTF8 pieces)</t> |
---|
2342 | <t hangText="b"> URI components (URI parsed into ASCII pieces, encoded correctly) </t> |
---|
2343 | <t hangText="c"> whole URI (for passing on to some other system that wants whole URIs) </t> |
---|
2344 | </list></t> |
---|
2345 | |
---|
2346 | <section title="OLD WAY"> |
---|
2347 | <t><list style="numbers"> |
---|
2348 | |
---|
2349 | <t>Pct-encoding on the whole thing to a URI. |
---|
2350 | (c1) If you want a (maybe broken) whole URI, you might |
---|
2351 | stop here.</t> |
---|
2352 | |
---|
2353 | <t>Parsing the URI into URI components. |
---|
2354 | (b1) If you want (maybe broken) URI components, stop here.</t> |
---|
2355 | |
---|
2356 | <t> Decode the components (undoing the pct-encoding). |
---|
2357 | (a) if you want IRI components, stop here.</t> |
---|
2358 | |
---|
2359 | <t> reencode: Either using a different encoding for some components |
---|
2360 | (for domain names, and query components in web pages, which |
---|
2361 | depends on the component, scheme and context), and otherwise |
---|
2362 | using pct-encoding. |
---|
2363 | (b2) if you want (good) URI components, stop here.</t> |
---|
2364 | |
---|
2365 | <t> reassemble the reencoded components. |
---|
2366 | (c2) if you want a (*good*) whole URI stop here.</t> |
---|
2367 | </list> |
---|
2368 | |
---|
2369 | </t> |
---|
2370 | |
---|
2371 | </section> |
---|
2372 | |
---|
2373 | <section title="NEW WAY"> |
---|
2374 | <t> |
---|
2375 | <list style="numbers"> |
---|
2376 | |
---|
2377 | <t> Parse the IRI into IRI components using the generic syntax. |
---|
2378 | (a) if you want IRI components, stop here.</t> |
---|
2379 | |
---|
2380 | <t> Encode each component, using pct-encoding, IDN encoding, or |
---|
2381 | special query part encoding depending on the component, |
---|
2382 | scheme or context. (b) If you want URI components, stop here.</t> |
---|
2383 | <t> reassemble a whole URI from the URI components. |
---|
2384 | (c) if you want a whole URI stop here.</t> |
---|
2385 | </list></t> |
---|
2386 | </section> |
---|
2387 | <section title="Extension of Syntax"> |
---|
2388 | <t>Added the tag range (U+E0000-E0FFF) to the iprivate production. |
---|
2389 | Some IRIs generated with the new syntax may fail to pass very strict checks |
---|
2390 | relying on the old syntax. But characters in this range should be extremely infrequent |
---|
2391 | anyway.</t> |
---|
2392 | </section> |
---|
2393 | <section title="More to be added"><t>TODO: There are more main changes that need to be |
---|
2394 | documented in this section.</t></section> |
---|
2395 | </section> |
---|
2396 | |
---|
2397 | <section title="Change Log"> |
---|
2398 | |
---|
2399 | <t>Note to RFC Editor: Please completely remove this section before publication.</t> |
---|
2400 | |
---|
2401 | <section title='Changes after draft-ietf-iri-3987bis-01'> |
---|
2402 | <t>Changes from draft-ietf-iri-3987bis-01 onwards are available as changesets |
---|
2403 | in the IETF tools subversion repository at |
---|
2404 | http://trac.tools.ietf.org/wg/iri/trac/log/draft-ietf-iri-3987bis/draft-ietf-iri-3987bis.xml.</t> |
---|
2405 | </section> |
---|
2406 | |
---|
2407 | <section title='Changes from draft-duerst-iri-bis-07 to draft-ietf-iri-3987bis-00'> |
---|
2408 | <t>Changed draft name, date, last paragraph of abstract, and titles in change log, and added this section |
---|
2409 | in moving from draft-duerst-iri-bis-07 (personal submission) to draft-ietf-iri-3987bis-00 (WG document).</t> |
---|
2410 | </section> |
---|
2411 | |
---|
2412 | <section title="Changes from -06 to -07 of draft-duerst-iri-bis"> |
---|
2413 | <t>Major restructuring of the processing model, see <xref target="forkChanges"></xref>.</t> |
---|
2414 | </section> |
---|
2415 | </section> |
---|
2416 | |
---|
2417 | <section title='Changes from -00 to -01'><t><list style="symbols"> |
---|
2418 | <t>Removed 'mailto:' before mail addresses of authors.</t> |
---|
2419 | <t>Added "&lt;to be done&gt;" as right side of 'href-strip' rule. Fixed '|' to '/' for |
---|
2420 | alternatives.</t> |
---|
2421 | </list></t> |
---|
2422 | </section> |
---|
2423 | |
---|
2424 | <section title="Changes from -05 to -06 of draft-duerst-iri-bis"><t><list style="symbols"> |
---|
2425 | <t>Add HyperText Reference, change abstract, acks and references for it</t> |
---|
2426 | <t>Add Masinter back as another editor.</t> |
---|
2427 | <t>Masinter integrates HRef material from HTML5 spec.</t> |
---|
2428 | <t>Rewrite introduction sections to modernize.</t> |
---|
2429 | </list></t> |
---|
2430 | </section> |
---|
2431 | |
---|
2432 | <section title="Changes from -04 to -05 of draft-duerst-iri-bis"> |
---|
2433 | <t><list style="symbols"> |
---|
2434 | <t>Updated references.</t> |
---|
2435 | <t>Changed IPR text to pre5378Trust200902.</t></list></t> |
---|
2436 | </section> |
---|
2437 | |
---|
2438 | <section title="Changes from -03 to -04 of draft-duerst-iri-bis"> |
---|
2439 | <t><list style="symbols"> |
---|
2440 | <t>Added explicit abbreviation for LEIRIs.</t> |
---|
2441 | <t>Mentioned LEIRI references.</t> |
---|
2442 | <t>Completed text in LEIRI section about tag characters and about specials.</t></list></t> |
---|
2443 | </section> |
---|
2444 | |
---|
2445 | <section title="Changes from -02 to -03 of draft-duerst-iri-bis"> |
---|
2446 | <t><list style="symbols"> |
---|
2447 | <t>Updated some references.</t> |
---|
2448 | <t>Updated Michel Suignard's coordinates.</t></list></t> |
---|
2449 | </section> |
---|
2450 | |
---|
2451 | <section title="Changes from -01 to -02 of draft-duerst-iri-bis"> |
---|
2452 | <t><list style="symbols"> |
---|
2453 | <t>Added tag range to iprivate (issue private-include-tags-115).</t> |
---|
2454 | <t>Added Specials (U+FFF0-FFFD) to Legacy Extended IRIs.</t></list></t> |
---|
2455 | </section> |
---|
2456 | <section title="Changes from -00 to -01 of draft-duerst-iri-bis"> |
---|
2457 | <t><list style="symbols"> |
---|
2458 | <t>Changed from "IRIs with Spaces/Controls" to "Legacy Extended IRI" |
---|
2459 | based on input from the W3C XML Core WG. |
---|
2460 | Moved the relevant subsections to the back and promoted them to a section.</t> |
---|
2461 | <t>Added some text re. Legacy Extended IRIs to the security section.</t> |
---|
2462 | <t>Added an IANA Considerations Section.</t> |
---|
2463 | <t>Added this Change Log Section.</t> |
---|
2464 | <t>Added a section about "IRIs with Spaces/Controls" (converting from a Note in RFC 3987).</t></list></t> |
---|
2465 | </section> |
---|
2466 | <section title="Changes from RFC 3987 to -00 of draft-duerst-iri-bis"> |
---|
2467 | <t><list> |
---|
2468 | <t>Fixed errata (see http://www.rfc-editor.org/cgi-bin/errataSearch.pl?rfc=3987).</t></list></t> |
---|
2469 | </section> |
---|
2470 | </section> |
---|
2471 | </middle> |
---|
2472 | |
---|
2473 | <back> |
---|
2474 | <references title="Normative References"> |
---|
2475 | |
---|
2476 | <reference anchor="ASCII"> |
---|
2477 | <front> |
---|
2478 | <title>Coded Character Set -- 7-bit American Standard Code for Information |
---|
2479 | Interchange</title> |
---|
2480 | <author> |
---|
2481 | <organization>American National Standards Institute</organization> |
---|
2482 | </author> |
---|
2483 | <date year="1986"/> |
---|
2484 | </front> |
---|
2485 | <seriesInfo name="ANSI" value="X3.4"/> |
---|
2486 | </reference> |
---|
2487 | |
---|
2488 | <reference anchor="ISO10646"> |
---|
2489 | <front> |
---|
2490 | <title>ISO/IEC 10646:2003: Information Technology - |
---|
2491 | Universal Multiple-Octet Coded Character Set (UCS)</title> |
---|
2492 | <author> |
---|
2493 | <organization>International Organization for Standardization</organization> |
---|
2494 | </author> |
---|
2495 | <date month="December" year="2003"/> |
---|
2496 | </front> |
---|
2497 | <seriesInfo name="ISO" value="Standard 10646"/> |
---|
2498 | </reference> |
---|
2499 | |
---|
2500 | &rfc2119; |
---|
2501 | &rfc3490; |
---|
2502 | &rfc3491; |
---|
2503 | &rfc3629; |
---|
2504 | &rfc3986; |
---|
2505 | |
---|
2506 | <reference anchor="STD68"> |
---|
2507 | <front> |
---|
2508 | <title abbrev="ABNF">Augmented BNF for Syntax Specifications: ABNF</title> |
---|
2509 | <author initials="D." surname="Crocker" fullname="Dave Crocker"><organization/></author> |
---|
2510 | <author initials="P." surname="Overell" fullname="Paul Overell"><organization/></author> |
---|
2511 | <date month="January" year="2008"/></front> |
---|
2512 | <seriesInfo name="STD" value="68"/><seriesInfo name="RFC" value="5234"/> |
---|
2513 | </reference> |
---|
2514 | |
---|
2515 | &rfc5890; |
---|
2516 | &rfc5891; |
---|
2517 | |
---|
2518 | <reference anchor="UNIV6"> |
---|
2519 | <front> |
---|
2520 | <title>The Unicode Standard, Version 6.0.0 (Mountain View, CA, The Unicode Consortium, 2011, ISBN 978-1-936213-01-6)</title> |
---|
2521 | <author><organization>The Unicode Consortium</organization></author> |
---|
2522 | <date year="2010" month="October"/> |
---|
2523 | </front> |
---|
2524 | </reference> |
---|
2525 | |
---|
2526 | <reference anchor="UNI9" target="http://www.unicode.org/reports/tr9/tr9-13.html"> |
---|
2527 | <front> |
---|
2528 | <title>The Bidirectional Algorithm</title> |
---|
2529 | <author initials="M." surname="Davis" fullname="Mark Davis"><organization/></author> |
---|
2530 | <date year="2004" month="March"/> |
---|
2531 | </front> |
---|
2532 | <seriesInfo name="Unicode Standard Annex" value="#9"/> |
---|
2533 | </reference> |
---|
2534 | |
---|
2535 | <reference anchor="UTR15" target="http://www.unicode.org/unicode/reports/tr15/tr15-23.html"> |
---|
2536 | <front> |
---|
2537 | <title>Unicode Normalization Forms</title> |
---|
2538 | <author initials="M." surname="Davis" fullname="Mark Davis"><organization/></author> |
---|
2539 | <author initials="M.J." surname="Duerst" fullname="Martin Duerst"><organization/></author> |
---|
2540 | <date year="2008" month="March"/> |
---|
2541 | </front> |
---|
2542 | <seriesInfo name="Unicode Standard Annex" value="#15"/> |
---|
2543 | </reference> |
---|
2544 | |
---|
2545 | </references> |
---|
2546 | |
---|
2547 | <references title="Informative References"> |
---|
2548 | |
---|
2549 | <reference anchor="BidiEx" target="http://www.w3.org/International/iri-edit/BidiExamples"> |
---|
2550 | <front> |
---|
2551 | <title>Examples of bidirectional IRIs</title> |
---|
2552 | <author><organization/></author> |
---|
2553 | <date year="" month=""/> |
---|
2554 | </front> |
---|
2555 | </reference> |
---|
2556 | |
---|
2557 | <reference anchor="CharMod" target="http://www.w3.org/TR/charmod-resid"> |
---|
2558 | <front> |
---|
2559 | <title>Character Model for the World Wide Web: Resource Identifiers</title> |
---|
2560 | <author initials="M." surname="Duerst" fullname="Martin Duerst"><organization/></author> |
---|
2561 | <author initials="F." surname="Yergeau" fullname="Francois Yergeau"><organization/></author> |
---|
2562 | <author initials="R." surname="Ishida" fullname="Richard Ishida"><organization/></author> |
---|
2563 | <author initials="M." surname="Wolf" fullname="Misha Wolf"><organization/></author> |
---|
2564 | <author initials="T." surname="Texin" fullname="Tex Texin"><organization/></author> |
---|
2565 | <date year="2004" month="November" day="25"/> |
---|
2566 | </front> |
---|
2567 | <seriesInfo name="World Wide Web Consortium" value="Candidate Recommendation"/> |
---|
2568 | </reference> |
---|
2569 | |
---|
2570 | <reference anchor="Duerst97" target="http://www.ifi.unizh.ch/mml/mduerst/papers/PDF/IUC11-UTF-8.pdf"> |
---|
2571 | <front> |
---|
2572 | <title>The Properties and Promises of UTF-8</title> |
---|
2573 | <author initials="M.J." surname="Duerst" fullname="Martin Duerst"><organization/></author> |
---|
2574 | <date year="1997" month="September"/> |
---|
2575 | </front> |
---|
2576 | <seriesInfo name="Proc. 11th International Unicode Conference, San Jose" value=""/> |
---|
2577 | </reference> |
---|
2578 | |
---|
2579 | <reference anchor="Gettys" target="http://www.w3.org/DesignIssues/ModelConsequences"> |
---|
2580 | <front> |
---|
2581 | <title>URI Model Consequences</title> |
---|
2582 | <author initials="J." surname="Gettys" fullname="Jim Gettys"><organization/></author> |
---|
2583 | <date month="" year=""/> |
---|
2584 | </front> |
---|
2585 | </reference> |
---|
2586 | |
---|
2587 | <reference anchor="HTML4" target="http://www.w3.org/TR/html401/appendix/notes.html#h-B.2"> |
---|
2588 | <front> |
---|
2589 | <title>HTML 4.01 Specification</title> |
---|
2590 | <author initials="D." surname="Raggett" fullname="Dave Raggett"><organization/></author> |
---|
2591 | <author initials="A." surname="Le Hors" fullname="Arnaud Le Hors"><organization/></author> |
---|
2592 | <author initials="I." surname="Jacobs" fullname="Ian Jacobs"><organization/></author> |
---|
2593 | <date year="1999" month="December" day="24"/> |
---|
2594 | </front> |
---|
2595 | <seriesInfo name="World Wide Web Consortium" value="Recommendation"/> |
---|
2596 | </reference> |
---|
2597 | |
---|
2598 | <reference anchor="LEIRI" target="http://www.w3.org/TR/leiri/"> |
---|
2599 | <front> |
---|
2600 | <title>Legacy extended IRIs for XML resource identification</title> |
---|
2601 | <author initials="H." surname="Thompson" fullname="Henry Thompson"><organization/></author> |
---|
2602 | <author initials="R." surname="Tobin" fullname="Richard Tobin"><organization/></author> |
---|
2603 | <author initials="N." surname="Walsh" fullname="Norman Walsh"><organization/></author> |
---|
2604 | <date year="2008" month="November" day="3"/> |
---|
2605 | |
---|
2606 | </front> |
---|
2607 | <seriesInfo name="World Wide Web Consortium" value="Note"/> |
---|
2608 | </reference> |
---|
2609 | |
---|
2610 | |
---|
2611 | &rfc2045; |
---|
2612 | &rfc2130; |
---|
2613 | &rfc2141; |
---|
2614 | &rfc2192; |
---|
2615 | &rfc2277; |
---|
2616 | &rfc2368; |
---|
2617 | &rfc2384; |
---|
2618 | &rfc2396; |
---|
2619 | &rfc2397; |
---|
2620 | &rfc2616; |
---|
2621 | &rfc1738; |
---|
2622 | &rfc2640; |
---|
2623 | &rfc3987; |
---|
2624 | <reference anchor='RFC4395bis'> |
---|
2625 | <front> |
---|
2626 | <title>Guidelines and Registration Procedures for New URI/IRI Schemes</title> |
---|
2627 | <author initials='T.' surname='Hansen' fullname="Tony Hansen"><organization/></author> |
---|
2628 | <author initials='T.' surname='Hardie' fullname="Ted Hardie"><organization/></author> |
---|
2629 | <author initials='L.' surname='Masinter' fullname="Larry Masinter"><organization/></author> |
---|
2630 | <date year="2010" month='September' day="30"/> |
---|
2631 | <workgroup>IRI</workgroup> |
---|
2632 | </front> |
---|
2633 | <seriesInfo name="Internet-Draft" value="draft-hansen-iri-4395bis-irireg-00"/> |
---|
2634 | </reference> |
---|
2635 | |
---|
2636 | |
---|
2637 | <reference anchor="UNIXML" target="http://www.w3.org/TR/unicode-xml/"> |
---|
2638 | <front> |
---|
2639 | <title>Unicode in XML and other Markup Languages</title> |
---|
2640 | <author initials="M.J." surname="Duerst" fullname="Martin Duerst"><organization/></author> |
---|
2641 | <author initials="A." surname="Freytag" fullname="Asmus Freytag"><organization/></author> |
---|
2642 | <date year="2003" month="June" day="18"/> |
---|
2643 | </front> |
---|
2644 | <seriesInfo name="Unicode Technical Report" value="#20"/> |
---|
2645 | <seriesInfo name="World Wide Web Consortium" value="Note"/> |
---|
2646 | </reference> |
---|
2647 | |
---|
2648 | <reference anchor="UTR36" target="http://unicode.org/reports/tr36/"> |
---|
2649 | <front> |
---|
2650 | <title>Unicode Security Considerations</title> |
---|
2651 | <author initials="M." surname="Davis" fullname="Mark Davis"><organization/></author> |
---|
2652 | <author initials="M." surname="Suignard" fullname="Michel Suignard"><organization/></author> |
---|
2653 | <date year="2010" month="August" day="4"/> |
---|
2654 | </front> |
---|
2655 | <seriesInfo name="Unicode Technical Report" value="#36"/> |
---|
2656 | </reference> |
---|
2657 | |
---|
2658 | <reference anchor="XLink" target="http://www.w3.org/TR/xlink/#link-locators"> |
---|
2659 | <front> |
---|
2660 | <title>XML Linking Language (XLink) Version 1.0</title> |
---|
2661 | <author initials="S." surname="DeRose" fullname="Steve DeRose"><organization/></author> |
---|
2662 | <author initials="E." surname="Maler" fullname="Eve Maler"><organization/></author> |
---|
2663 | <author initials="D." surname="Orchard" fullname="David Orchard"><organization/></author> |
---|
2664 | <date year="2001" month="June" day="27"/> |
---|
2665 | </front> |
---|
2666 | <seriesInfo name="World Wide Web Consortium" value="Recommendation"/> |
---|
2667 | </reference> |
---|
2668 | |
---|
2669 | <reference anchor="XML1" target="http://www.w3.org/TR/REC-xml"> |
---|
2670 | <front> |
---|
2671 | <title>Extensible Markup Language (XML) 1.0 (Fourth Edition)</title> |
---|
2672 | <author initials="T." surname="Bray" fullname="Tim Bray"><organization/></author> |
---|
2673 | <author initials="J." surname="Paoli" fullname="Jean Paoli"><organization/></author> |
---|
2674 | <author initials="C.M." surname="Sperberg-McQueen" fullname="C. M. Sperberg-McQueen"> |
---|
2675 | <organization/></author> |
---|
2676 | <author initials="E." surname="Maler" fullname="Eve Maler"><organization/></author> |
---|
2677 | <author initials="F." surname="Yergeau" fullname="Francois Yergeau"><organization/></author> |
---|
2678 | <date day="16" month="August" year="2006"/> |
---|
2679 | </front> |
---|
2680 | <seriesInfo name="World Wide Web Consortium" value="Recommendation"/> |
---|
2681 | </reference> |
---|
2682 | |
---|
2683 | <reference anchor="XMLNamespace" target="http://www.w3.org/TR/REC-xml-names"> |
---|
2684 | <front> |
---|
2685 | <title>Namespaces in XML (Second Edition)</title> |
---|
2686 | <author initials="T." surname="Bray" fullname="Tim Bray"><organization/></author> |
---|
2687 | <author initials="D." surname="Hollander" fullname="Dave Hollander"><organization/></author> |
---|
2688 | <author initials="A." surname="Layman" fullname="Andrew Layman"><organization/></author> |
---|
2689 | <author initials="R." surname="Tobin" fullname="Richard Tobin"><organization></organization></author> |
---|
2690 | <date day="16" month="August" year="2006"/> |
---|
2691 | </front> |
---|
2692 | <seriesInfo name="World Wide Web Consortium" value="Recommendation"/> |
---|
2693 | </reference> |
---|
2694 | |
---|
2695 | <reference anchor="XMLSchema" target="http://www.w3.org/TR/xmlschema-2/#anyURI"> |
---|
2696 | <front> |
---|
2697 | <title>XML Schema Part 2: Datatypes</title> |
---|
2698 | <author initials="P." surname="Biron" fullname="Paul Biron"><organization/></author> |
---|
2699 | <author initials="A." surname="Malhotra" fullname="Ashok Malhotra"><organization/></author> |
---|
2700 | <date year="2001" month="May" day="2"/> |
---|
2701 | </front> |
---|
2702 | <seriesInfo name="World Wide Web Consortium" value="Recommendation"/> |
---|
2703 | </reference> |
---|
2704 | |
---|
2705 | <reference anchor="XPointer" target="http://www.w3.org/TR/xptr-framework/#escaping"> |
---|
2706 | <front> |
---|
2707 | <title>XPointer Framework</title> |
---|
2708 | <author initials="P." surname="Grosso" fullname="Paul Grosso"><organization/></author> |
---|
2709 | <author initials="E." surname="Maler" fullname="Eve Maler"><organization/></author> |
---|
2710 | <author initials="J." surname="Marsh" fullname="Jonathan Marsh"><organization/></author> |
---|
2711 | <author initials="N." surname="Walsh" fullname="Norman Walsh"><organization/></author> |
---|
2712 | <date year="2003" month="March" day="25"/> |
---|
2713 | </front> |
---|
2714 | <seriesInfo name="World Wide Web Consortium" value="Recommendation"/> |
---|
2715 | </reference> |
---|
2716 | |
---|
2717 | <reference anchor="HTML5" target="http://www.w3.org/TR/2009/WD-html5-20090423/"> |
---|
2718 | <front> |
---|
2719 | <title>A vocabulary and associated APIs for HTML and XHTML</title> |
---|
2720 | <author initials="I." surname="Hickson" fullname="Ian Hickson"><organization>Google, Inc.</organization></author> |
---|
2721 | <author initials="D." surname="Hyatt" fullname="David Hyatt"><organization>Apple, Inc.</organization></author> |
---|
2722 | <date year="2009" month="April" day="23"/> |
---|
2723 | </front> |
---|
2724 | <seriesInfo name="World Wide Web Consortium" value="Working Draft"/> |
---|
2725 | </reference> |
---|
2726 | |
---|
2727 | </references> |
---|
2728 | |
---|
2729 | </back> |
---|
2730 | </rfc> |
---|