source: draft-ietf-iri-3987bis/draft-ietf-iri-3987bis.xml @ 39

Last change on this file since 39 was 39, checked in by duerst@…, 9 years ago

Unifying upper-casing in (sub)section titles

  • Property svn:executable set to *
File size: 123.9 KB
Line 
1<?xml version="1.0"?>
2<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
3<!ENTITY rfc1738 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.1738.xml">
4<!ENTITY rfc2045 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2045.xml">
5<!ENTITY rfc2119 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml">
6<!ENTITY rfc2130 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2130.xml">
7<!ENTITY rfc2141 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2141.xml">
8<!ENTITY rfc2192 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2192.xml">
9<!ENTITY rfc2277 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2277.xml">
10<!ENTITY rfc2368 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2368.xml">
11<!ENTITY rfc2384 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2384.xml">
12<!ENTITY rfc2396 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2396.xml">
13<!ENTITY rfc2397 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2397.xml">
14<!ENTITY rfc2616 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2616.xml">
15<!ENTITY rfc2640 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2640.xml">
16<!ENTITY rfc3490 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3490.xml">
17<!ENTITY rfc3491 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3491.xml">
18<!ENTITY rfc3629 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3629.xml">
19<!ENTITY rfc3986 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3986.xml">
20<!ENTITY rfc3987 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3987.xml">
21<!ENTITY rfc5890 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5890.xml">
22<!ENTITY rfc5891 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5891.xml">
23]>
24<?rfc strict='yes'?>
25
26<?xml-stylesheet type='text/css' href='rfc2629.css' ?>
27<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
28<?rfc symrefs='yes'?>
29<?rfc sortrefs='yes'?>
30<?rfc iprnotified="no" ?>
31<?rfc toc='yes'?>
32<?rfc compact='yes'?>
33<?rfc subcompact='no'?>
34<rfc ipr="pre5378Trust200902" docName="draft-ietf-iri-3987bis-04" category="std" xml:lang="en" obsoletes="3987">
35<front>
36<title abbrev="IRIs">Internationalized Resource Identifiers (IRIs)</title>
37
38  <author initials="M.J." surname="Duerst" fullname='Martin Duerst'>
39    <!-- (Note: Please write "Duerst" with u-umlaut wherever
40      possible, for example as "D&#252;rst" in XML and HTML) -->
41  <organization abbrev="Aoyama Gakuin University">Aoyama Gakuin University</organization>
42  <address>
43  <postal>
44  <street>5-10-1 Fuchinobe</street>
45  <city>Sagamihara</city>
46  <region>Kanagawa</region>
47  <code>229-8558</code>
48  <country>Japan</country>
49  </postal>
50  <phone>+81 42 759 6329</phone>
51  <facsimile>+81 42 759 6495</facsimile>
52  <email>duerst@it.aoyama.ac.jp</email>
53  <uri>http://www.sw.it.aoyama.ac.jp/D%C3%BCrst/<!-- (Note: This is the percent-encoded form of an IRI)--></uri>
54  </address>
55</author>
56
57<author initials="M.L." surname="Suignard" fullname="Michel Suignard">
58   <organization>Unicode Consortium</organization>
59   <address>
60   <postal>
61   <street></street>
62   <street>P.O. Box 391476</street>
63   <city>Mountain View</city>
64   <region>CA</region>
65   <code>94039-1476</code>
66   <country>U.S.A.</country>
67   </postal>
68   <phone>+1-650-693-3921</phone>
69   <email>michel@unicode.org</email>
70   <uri>http://www.suignard.com</uri>
71   </address>
72</author>
73<author initials="L." surname="Masinter" fullname="Larry Masinter">
74   <organization>Adobe</organization>
75   <address>
76   <postal>
77   <street>345 Park Ave</street>
78   <city>San Jose</city>
79   <region>CA</region>
80   <code>95110</code>
81   <country>U.S.A.</country>
82   </postal>
83   <phone>+1-408-536-3024</phone>
84   <email>masinter@adobe.com</email>
85   <uri>http://larry.masinter.net</uri>
86   </address>
87</author>
88
89<date year="2011" month="March" day="14"/>
90<area>Applications</area>
91<workgroup>Internationalized Resource Identifiers (iri)</workgroup>
92<keyword>IRI</keyword>
93<keyword>Internationalized Resource Identifier</keyword>
94<keyword>UTF-8</keyword>
95<keyword>URI</keyword>
96<keyword>URL</keyword>
97<keyword>IDN</keyword>
98<keyword>LEIRI</keyword>
99
100<abstract>
101<t>This document defines the Internationalized Resource Identifier
102(IRI) protocol element, as an extension of the Uniform Resource
103Identifier (URI).  An IRI is a sequence of characters from the
104Universal Character Set (Unicode/ISO 10646). Grammar and processing
105rules are given for IRIs and related syntactic forms.</t>
106
107<t>In addition, this document provides named additional rule sets
108for processing otherwise invalid IRIs, in a way that supports
109other specifications that wish to mandate common behavior for
110'error' handling. In particular, rules used in some XML languages
111(LEIRI) and web applications are given.</t>
112
113<t>Defining IRI as a new protocol element (rather than updating or
114extending the definition of URI) allows independent orderly
115transitions: other protocols and languages that use URIs must
116explicitly choose to allow IRIs.</t>
117
118<t>Guidelines are provided for the use and deployment of IRIs and
119related protocol elements when revising protocols, formats, and
120software components that currently deal only with URIs.</t>
121
122</abstract>
123  <note title='RFC Editor: Please remove the next paragraph before publication.'>
124    <t>This document is intended to update RFC 3987 and move towards IETF
125    Draft Standard.  For discussion and comments on this
126    draft, please join the IETF IRI WG by subscribing to the mailing
127    list public-iri@w3.org. For a list of open issues, please see
128    the issue tracker of the WG at http://trac.tools.ietf.org/wg/iri/trac/report/1.</t>
129</note>
130</front>
131<middle>
132
133<section title="Introduction">
134
135<section title="Overview and Motivation" anchor="overview">
136
137<t>A Uniform Resource Identifier (URI) is defined in <xref
138target="RFC3986"/> as a sequence of characters chosen from a limited
139subset of the repertoire of US-ASCII <xref target="ASCII"/>
140characters.</t>
141
142<t>The characters in URIs are frequently used for representing words
143of natural languages.  This usage has many advantages: Such URIs are
144easier to memorize, easier to interpret, easier to transcribe, easier
145to create, and easier to guess. For most languages other than English,
146however, the natural script uses characters other than A - Z. For many
147people, handling Latin characters is as difficult as handling the
148characters of other scripts is for those who use only the Latin
149alphabet. Many languages with non-Latin scripts are transcribed with
150Latin letters. These transcriptions are now often used in URIs, but
151they introduce additional difficulties.</t>
152
153<t>The infrastructure for the appropriate handling of characters from
154additional scripts is now widely deployed in operating system and
155application software. Software that can handle a wide variety of
156scripts and languages at the same time is increasingly common. Also,
157an increasing number of protocols and formats can carry a wide range of
158characters.</t>
159
160<t>URIs are used both as a protocol element (for transmission and
161processing by software) and also a presentation element (for display
162and handling by people who read, interpret, coin, or guess them). The
163transition between these roles is more difficult and complex when
164dealing with a larger set of characters than allowed for URIs in
165<xref target="RFC3986"/>. </t>
166
167<t>This document defines the protocol element called Internationalized
168Resource Identifier (IRI), which allows applications of URIs to be
169extended to use resource identifiers that have a much wider repertoire
170of characters. It also provides corresponding "internationalized"
171versions of other constructs from <xref target="RFC3986"/>, such as
172URI references. The syntax of IRIs is defined in <xref
173target="syntax"/>.
174</t>
175
176<t>Using characters outside of A - Z in IRIs adds a number of
177difficulties. <xref target="Bidi"/> discusses the special case of
178bidirectional IRIs using characters from scripts written
179right-to-left.  <xref target="equivalence"/> discusses various forms
180of equivalence between IRIs. <xref target="IRIuse"/> discusses the use
181of IRIs in different situations.  <xref target="guidelines"/> gives
182additional informative guidelines.  <xref target="security"/>
183discusses IRI-specific security considerations.</t>
184
185  <t>When originally defining IRIs, several design alternatives were considered.
186    Historically interested readers can find an overview in Appendix A of <xref target="RFC3987"/>.
187  For some additional background on the design of URIs and IRIs, please also see
188    <xref target="Gettys"/>.</t>
189</section> <!-- overview -->
190
191<section title="Applicability" anchor="Applicability">
192
193<t>IRIs are designed to allow protocols and software that deal with
194URIs to be updated to handle IRIs. A "URI scheme" (as defined by <xref
195target="RFC3986"/> and registered through the IANA process defined in
196<xref target="RFC4395bis"/>) also serves as an "IRI scheme". Processing of
197IRIs is accomplished by extending the URI syntax while retaining (and
198not expanding) the set of "reserved" characters, such that the syntax
199for any URI scheme may be uniformly extended to allow non-ASCII
200characters. In addition, following parsing of an IRI, it is possible
201to construct a corresponding URI by first encoding characters outside
202of the allowed URI range and then reassembling the components.
203</t>
204
205<t>Practical use of IRI forms in place of URI forms depends on the
206following conditions being met:</t>
207
208<t><list style="hanging">
209   
210<t hangText="a.">A protocol or format element MUST be explicitly designated to be
211  able to carry IRIs. The intent is to avoid introducing IRIs into
212  contexts that are not defined to accept them.  For example, XML
213  schema <xref target="XMLSchema"/> has an explicit type "anyURI" that
214  includes IRIs and IRI references. Therefore, IRIs and IRI references
215  can be in attributes and elements of type "anyURI".  On the other
216  hand, in the <xref target="RFC2616"/> definition of HTTP/1.1, the
217  Request URI is defined as a URI, which means that direct use of IRIs
218  is not allowed in HTTP requests.</t>
219
220<t hangText="b.">The protocol or format carrying the IRIs MUST have a
221  mechanism to represent the wide range of characters used in IRIs,
222  either natively or by some protocol- or format-specific escaping
223  mechanism (for example, numeric character references in <xref
224  target="XML1"/>).</t>
225
226<t hangText="c.">The URI scheme definition, if it explicitly allows a
227  percent sign ("%") in any syntactic component, SHOULD define the
228  interpretation of sequences of percent-encoded octets (using "%XX"
229  hex octets) as octets from sequences of UTF-8 encoded strings; this
230  is recommended in the guidelines for registering new schemes, <xref
231  target="RFC4395bis"/>.  For example, this is the practice for IMAP URLs
232  <xref target="RFC2192"/>, POP URLs <xref target="RFC2384"/> and the
233  URN syntax <xref target="RFC2141"/>. Note that use of
234  percent-encoding may also be restricted in some situations, for
235  example, URI schemes that disallow percent-encoding might still be
236  used with a fragment identifier which is percent-encoded (e.g.,
237  <xref target="XPointer"/>). See <xref target="UTF8use"/> for further
238  discussion.</t>
239</list></t>
240
241</section> <!-- applicability -->
242
243<section title="Definitions" anchor="sec-Definitions">
244 
245<t>The following definitions are used in this document; they follow the
246terms in <xref target="RFC2130"/>, <xref target="RFC2277"/>, and
247<xref target="ISO10646"/>.</t>
248<t><list style="hanging">
249   
250<t hangText="character:">A member of a set of elements used for the
251    organization, control, or representation of data. For example,
252    "LATIN CAPITAL LETTER A" names a character.</t>
253   
254<t hangText="octet:">An ordered sequence of eight bits considered as a
255    unit.</t>
256   
257<t hangText="character repertoire:">A set of characters (set in the
258    mathematical sense).</t>
259   
260<t hangText="sequence of characters:">A sequence of characters (one
261    after another).</t>
262   
263<t hangText="sequence of octets:">A sequence of octets (one after
264    another).</t>
265   
266<t hangText="character encoding:">A method of representing a sequence
267    of characters as a sequence of octets (maybe with variants). Also,
268    a method of (unambiguously) converting a sequence of octets into a
269    sequence of characters.</t>
270   
271<t hangText="charset:">The name of a parameter or attribute used to
272    identify a character encoding.</t>
273   
274<t hangText="UCS:">Universal Character Set. The coded character set
275    defined by ISO/IEC 10646 <xref target="ISO10646"/> and the Unicode
276    Standard <xref target="UNIV6"/>.</t>
277   
278<t hangText="IRI reference:">Denotes the common usage of an
279    Internationalized Resource Identifier. An IRI reference may be
280    absolute or relative.  However, the "IRI" that results from such a
281    reference only includes absolute IRIs; any relative IRI references
282    are resolved to their absolute form.  Note that in <xref
283    target="RFC2396"/> URIs did not include fragment identifiers, but
284    in <xref target="RFC3986"/> fragment identifiers are part of
285    URIs.</t>
286   
287<t hangText="URL:">The term "URL" was originally used <xref
288   target="RFC1738"/> for roughly what is now called a "URI".  Books,
289   software and documentation often refer to URIs and IRIs using the
290   "URL" term. Some usages restrict "URL" to those URIs which are not
291   URNs. Because of the ambiguity of the term, using the term "URL" is
292   NOT RECOMMENDED in formal documents.</t>
293
294<t hangText="LEIRI (Legacy Extended IRI) processing:">  This term was used in
295   various XML specifications to refer
296   to strings that, although not valid IRIs, were acceptable input to
297   the processing rules in <xref target="LEIRIspec" />.</t>
298
299<t hangText="(Web Address, Hypertext Reference, HREF):"> These terms have been
300   added in this document for convenience, to allow other
301   specifications to refer to those strings that, although not valid
302   IRIs, are acceptable input to the processing rules in <xref
303   target="webaddress"/>. This usage corresponds to the parsing rules
304   of some popular web browsing applications.
305   ISSUE: Need to find a good name/abbreviation for these.</t>
306   
307<t hangText="running text:">Human text (paragraphs, sentences,
308   phrases) with syntax according to orthographic conventions of a
309   natural language, as opposed to syntax defined for ease of
310   processing by machines (e.g., markup, programming languages).</t>
311   
312<t hangText="protocol element:">Any portion of a message that affects
313    processing of that message by the protocol in question.</t>
314   
315<t hangText="presentation element:">A presentation form corresponding
316    to a protocol element; for example, using a wider range of
317    characters.</t>
318   
319<t hangText="create (a URI or IRI):">With respect to URIs and IRIs,
320     the term is used for the initial creation. This may be the
321     initial creation of a resource with a certain identifier, or the
322     initial exposition of a resource under a particular
323     identifier.</t>
324   
325<t hangText="generate (a URI or IRI):">With respect to URIs and IRIs,
326     the term is used when the identifier is generated by derivation
327     from other information.</t>
328
329<t hangText="parsed URI component:">When a URI processor parses a URI
330   (following the generic syntax or a scheme-specific syntax), the result
331   is a set of parsed URI components, each of which has a type
332   (corresponding to the syntactic definition) and a sequence of URI
333   characters.  </t>
334
335<t hangText="parsed IRI component:">When an IRI processor parses
336   an IRI directly, following the general syntax or a scheme-specific
337   syntax, the result is a set of parsed IRI components, each of
338   which has a type (corresponding to the syntactic definition)
339   and a sequence of IRI characters. (This definition is analogous
340   to "parsed URI component".)</t>
341
342<t hangText="IRI scheme:">A URI scheme may also be known as
343   an "IRI scheme" if the scheme's syntax has been extended to
344   allow non-US-ASCII characters according to the rules in this
345   document.</t>
346
347</list></t>
348</section> <!-- definitions -->
349<section title="Notation" anchor="sec-Notation">
350     
351<t>RFCs and Internet Drafts currently do not allow any characters
352outside the US-ASCII repertoire. Therefore, this document uses various
353special notations to denote such characters in examples.</t>
354     
355<t>In text, characters outside US-ASCII are sometimes referenced by
356using a prefix of 'U+', followed by four to six hexadecimal
357digits.</t>
358
359<t>To represent characters outside US-ASCII in examples, this document
360uses two notations: 'XML Notation' and 'Bidi Notation'.</t>
361
362<t>XML Notation uses a leading '&amp;#x', a trailing ';', and the
363hexadecimal number of the character in the UCS in between. For
364example, &amp;#x44F; stands for CYRILLIC CAPITAL LETTER YA. In this
365notation, an actual '&amp;' is denoted by '&amp;amp;'.</t>
366
367<t>Bidi Notation is used for bidirectional examples: Lower case
368letters stand for Latin letters or other letters that are written left
369to right, whereas upper case letters represent Arabic or Hebrew
370letters that are written right to left.</t>
371
372<t>To denote actual octets in examples (as opposed to percent-encoded
373octets), the two hex digits denoting the octet are enclosed in "&lt;"
374and "&gt;".  For example, the octet often denoted as 0xc9 is denoted
375here as &lt;c9&gt;.</t>
376
377<t> In this document, the key words "MUST", "MUST NOT", "REQUIRED",
378"SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY",
379and "OPTIONAL" are to be interpreted as described in <xref
380target="RFC2119"/>.</t>
381
382</section> <!-- notation -->
383</section> <!-- introduction -->
384
385<section title="IRI Syntax" anchor="syntax">
386<t>This section defines the syntax of Internationalized Resource
387Identifiers (IRIs).</t>
388
389<t>As with URIs, an IRI is defined as a sequence of characters, not as
390a sequence of octets. This definition accommodates the fact that IRIs
391may be written on paper or read over the radio as well as stored or
392transmitted digitally.  The same IRI might be represented as different
393sequences of octets in different protocols or documents if these
394protocols or documents use different character encodings (and/or
395transfer encodings).  Using the same character encoding as the
396containing protocol or document ensures that the characters in the IRI
397can be handled (e.g., searched, converted, displayed) in the same way
398as the rest of the protocol or document.</t>
399
400<section title="Summary of IRI Syntax" anchor="summary">
401
402<t>IRIs are defined by extending the URI syntax in <xref
403target="RFC3986"/>, but extending the class of unreserved characters
404by adding the characters of the UCS (Universal Character Set, <xref
405target="ISO10646"/>) beyond U+007F, subject to the limitations given
406in the syntax rules below and in <xref target="limitations"/>.</t>
407
408<t>The syntax and use of components and reserved characters is the
409same as that in <xref target="RFC3986"/>. Each "URI scheme" thus also
410functions as an "IRI scheme", in that scheme-specific parsing rules
411for URIs of a scheme are extended to allow parsing of IRIs using
412the same parsing rules.</t>
413
414<t>All the operations defined in <xref target="RFC3986"/>, such as the
415resolution of relative references, can be applied to IRIs by
416IRI-processing software in exactly the same way as they are for URIs
417by URI-processing software.</t>
418
419<t>Characters outside the US-ASCII repertoire MUST NOT be reserved and
420therefore MUST NOT be used for syntactical purposes, such as to
421delimit components in newly defined schemes. For example, U+00A2, CENT
422SIGN, is not allowed as a delimiter in IRIs, because it is in the
423'iunreserved' category. This is similar to the fact that it is not
424possible to use '-' as a delimiter in URIs, because it is in the
425'unreserved' category.</t>
426
427</section> <!-- summary -->
428<section title="ABNF for IRI References and IRIs" anchor="abnf">
429
430<t>An ABNF definition for IRI references (which are the most general
431concept and the start of the grammar) and IRIs is given here. The
432syntax of this ABNF is described in <xref target="STD68"/>. Character
433numbers are taken from the UCS, without implying any actual binary
434encoding. Terminals in the ABNF are characters, not octets.</t>
435
436<t>The following grammar closely follows the URI grammar in <xref
437target="RFC3986"/>, except that the range of unreserved characters is
438expanded to include UCS characters, with the restriction that private
439UCS characters can occur only in query parts. The grammar is split
440into two parts: Rules that differ from <xref target="RFC3986"/>
441because of the above-mentioned expansion, and rules that are the same
442as those in <xref target="RFC3986"/>. For rules that are different
443than those in <xref target="RFC3986"/>, the names of the non-terminals
444have been changed as follows. If the non-terminal contains 'URI', this
445has been changed to 'IRI'. Otherwise, an 'i' has been prefixed.</t>
446
447<!--
448for line length measuring in artwork (max 72 chars, three chars at start):
449      1         2         3         4         5         6         7
450456789012345678901234567890123456789012345678901234567890123456789012
451-->
452<figure>
453<preamble>The following rules are different from those in <xref target="RFC3986"/>:</preamble>
454<artwork>
455IRI            = scheme ":" ihier-part [ "?" iquery ]
456                 [ "#" ifragment ]
457
458ihier-part     = "//" iauthority ipath-abempty
459               / ipath-absolute
460               / ipath-rootless
461               / ipath-empty
462
463IRI-reference  = IRI / irelative-ref
464
465absolute-IRI   = scheme ":" ihier-part [ "?" iquery ]
466
467irelative-ref  = irelative-part [ "?" iquery ] [ "#" ifragment ]
468
469irelative-part = "//" iauthority ipath-abempty
470               / ipath-absolute
471               / ipath-noscheme
472               / ipath-empty
473
474iauthority     = [ iuserinfo "@" ] ihost [ ":" port ]
475iuserinfo      = *( iunreserved / pct-form / sub-delims / ":" )
476ihost          = IP-literal / IPv4address / ireg-name
477
478pct-form       = pct-encoded
479
480ireg-name      = *( iunreserved / sub-delims )
481
482ipath          = ipath-abempty   ; begins with "/" or is empty
483               / ipath-absolute  ; begins with "/" but not "//"
484               / ipath-noscheme  ; begins with a non-colon segment
485               / ipath-rootless  ; begins with a segment
486               / ipath-empty     ; zero characters
487
488ipath-abempty  = *( path-sep isegment )
489ipath-absolute = path-sep [ isegment-nz *( path-sep isegment ) ]
490ipath-noscheme = isegment-nz-nc *( path-sep isegment )
491ipath-rootless = isegment-nz *( path-sep isegment )
492ipath-empty    = 0&lt;ipchar&gt;
493path-sep       = "/"
494
495isegment       = *ipchar
496isegment-nz    = 1*ipchar
497isegment-nz-nc = 1*( iunreserved / pct-form / sub-delims
498                     / "@" )
499               ; non-zero-length segment without any colon ":"                     
500
501ipchar         = iunreserved / pct-form / sub-delims / ":"
502               / "@"
503 
504iquery         = *( ipchar / iprivate / "/" / "?" )
505
506ifragment      = *( ipchar / "/" / "?" / "#" )
507
508iunreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
509
510ucschar        = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
511               / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
512               / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
513               / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
514               / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
515               / %xD0000-DFFFD / %xE1000-EFFFD
516
517iprivate       = %xE000-F8FF / %xE0000-E0FFF / %xF0000-FFFFD
518               / %x100000-10FFFD
519</artwork>
520</figure>
521
522<t>Some productions are ambiguous. The "first-match-wins" (a.k.a. "greedy")
523algorithm applies. For details, see <xref target="RFC3986"/>.</t>
524
525<figure>
526<preamble>The following rules are the same as those in <xref target="RFC3986"/>:</preamble>
527<artwork>
528scheme         = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
529 
530port           = *DIGIT
531 
532IP-literal     = "[" ( IPv6address / IPvFuture  ) "]"
533 
534IPvFuture      = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
535 
536IPv6address    =                            6( h16 ":" ) ls32
537               /                       "::" 5( h16 ":" ) ls32
538               / [               h16 ] "::" 4( h16 ":" ) ls32
539               / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
540               / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
541               / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
542               / [ *4( h16 ":" ) h16 ] "::"              ls32
543               / [ *5( h16 ":" ) h16 ] "::"              h16
544               / [ *6( h16 ":" ) h16 ] "::"
545               
546h16            = 1*4HEXDIG
547ls32           = ( h16 ":" h16 ) / IPv4address
548
549IPv4address    = dec-octet "." dec-octet "." dec-octet "." dec-octet
550
551dec-octet      = DIGIT                 ; 0-9
552               / %x31-39 DIGIT         ; 10-99
553               / "1" 2DIGIT            ; 100-199
554               / "2" %x30-34 DIGIT     ; 200-249
555               / "25" %x30-35          ; 250-255
556           
557pct-encoded    = "%" HEXDIG HEXDIG
558
559unreserved     = ALPHA / DIGIT / "-" / "." / "_" / "~"
560reserved       = gen-delims / sub-delims
561gen-delims     = ":" / "/" / "?" / "#" / "[" / "]" / "@"
562sub-delims     = "!" / "$" / "&amp;" / "'" / "(" / ")"
563               / "*" / "+" / "," / ";" / "="
564</artwork></figure>
565
566<t>This syntax does not support IPv6 scoped addressing zone identifiers.</t>
567
568</section> <!-- abnf -->
569
570</section> <!-- syntax -->
571
572<section title="Processing IRIs and related protocol elements" anchor="processing">
573
574<t>IRIs are meant to replace URIs in identifying resources within new
575versions of protocols, formats, and software components that use a
576UCS-based character repertoire.  Protocols and components may use and
577process IRIs directly. However, there are still numerous systems and
578protocols which only accept URIs or components of parsed URIs; that is,
579they only accept sequences of characters within the subset of US-ASCII
580characters allowed in URIs. </t>
581
582<t>This section defines specific processing steps for IRI consumers
583which establish the relationship between the string given and the
584interpreted derivatives. These
585processing steps apply to both IRIs and IRI references (i.e., absolute
586or relative forms); for IRIs, some steps are scheme specific. </t>
587
588<section title="Converting to UCS" anchor="ucsconv"> 
589 
590<t>Input that is already in a Unicode form (i.e., a sequence of Unicode
591 characters or an octet-stream representing a Unicode-based character
592 encoding such as UTF-8 or UTF-16) should be left as is and not
593 normalized (see <xref target="normalization"/>).</t>
594
595  <t>An IRI or IRI reference is a sequence of characters from the UCS.
596    For IRIs that are not already in a Unicode form
597    (as when written on paper, read aloud, or represented in a text stream
598    using a legacy character encoding), convert the IRI to Unicode.
599    Note that some character encodings or transcriptions can be converted
600    to or represented by more than one sequence of Unicode characters.
601    Ideally the resulting IRI would use a normalized form,
602    such as Unicode Normalization Form C <xref target="UTR15"/>
603    (see <xref target='ladder'/> Normalization and Comparison),
604    since that ensures a stable, consistent representation
605    that is most likely to produce the intended results.
606    Implementers and users are cautioned that, while denormalized character sequences are valid,
607    they might be difficult for other users or processes to reproduce
608    and might lead to unexpected results.
609  </t>
610
611<t> In other cases (written on paper, read aloud, or otherwise
612 represented independent of any character encoding) represent the IRI
613 as a sequence of characters from the UCS normalized according to
614 Unicode Normalization Form C (NFC, <xref target="UTR15"/>).</t>
615</section> <!-- ucsconv -->
616
617<section title="Parse the IRI into IRI components">
618
619<t>Parse the IRI, either as a relative reference (no scheme)
620or using scheme specific processing (according to the scheme
621given); the result is a set of parsed IRI components.
622(NOTE: FIX BEFORE RELEASE: INTENT IS THAT ALL IRI SCHEMES
623THAT USE GENERIC SYNTAX AND ALLOW NON-ASCII AUTHORITY CAN
624ONLY USE AUTHORITY FOR NAMES THAT FOLLOW PUNICODE.)
625 </t>
626
627<t>NOTE: Parsing into components will result
628in a correspondence of substrings of the IRI according to the part
629matched.  For example, in <xref target="HTML5"/>, the protocol
630components of interest are SCHEME (scheme), HOST (ireg-name), PORT
631(port), the PATH (ipath after the initial "/"), QUERY (iquery),
632FRAGMENT (ifragment), and AUTHORITY (iauthority).
633</t>
634
635<t>Subsequent processing rules are sometimes used to define other
636syntactic components. For example, <xref target="HTML5"/> defines APIs
637for IRI processing; in these APIs:
638
639<list style="hanging">
640<t hangText="HOSTSPECIFIC"> the substring that follows
641the substring matched by the iauthority production, or the whole
642string if the iauthority production wasn't matched.</t>
643<t hangText="HOSTPORT"> if there is a scheme component and a port
644component and the port given by the port component is different than
645the default port defined for the protocol given by the scheme
646component, then HOSTPORT is the substring that starts with the
647substring matched by the host production and ends with the substring
648matched by the port production, and includes the colon in between the
649two. Otherwise, it is the same as the host component.
650</t>
651</list>
652</t>
653</section> <!-- parse -->
654
655<section title="General percent-encoding of IRI components" anchor="compmapping">
656   
657<t>For most IRI components, it is possible to map the IRI component
658to an equivalent URI component by percent-encoding those characters
659not allowed in URIs. Previous processing steps will have removed
660some characters, and the interpretation of reserved characters will
661have already been done (with the syntactic reserved characters outside
662of the IRI component). This mapping is defined for all sequences
663of Unicode characters, whether or not they are valid for the component
664in question. </t>
665   
666<t>For each character which is not allowed in a valid URI (NOTE: WHAT
667IS THE RIGHT REFERENCE HERE), apply the following steps. </t>
668
669<t><list style="hanging">
670
671<t hangText="Convert to UTF-8">Convert the character to a sequence of
672  one or more octets using UTF-8 <xref target="RFC3629"/>.</t>
673
674<t hangText="Percent encode">Convert each octet of this sequence to %HH,
675   where HH is the hexadecimal notation of the octet value. The
676   hexadecimal notation SHOULD use uppercase letters. (This is the
677   general URI percent-encoding mechanism in Section 2.1 of <xref
678   target="RFC3986"/>.)</t>
679   
680</list></t>
681
682<t>Note that the mapping is an identity transformation for parsed URI
683components of valid URIs, and is idempotent: applying the mapping a
684second time will not change anything.</t>
685</section> <!-- general conversion -->
686
687<section title="Mapping ireg-name" anchor="dnsmapping">
688
689<t>Schemes that allow non-ASCII characters
690in the reg-name (ireg-name) position MUST convert the ireg-name
691component of an IRI as follows:</t>
692
693<t>Replace the ireg-name part of the IRI by the part converted using
694the ToASCII operation specified in Section 4.1 of <xref
695target="RFC3490"/> on each dot-separated label, and by using U+002E
696(FULL STOP) as a label separator, with the flag UseSTD3ASCIIRules set
697to FALSE, and with the flag AllowUnassigned set to FALSE.
698The ToASCII operation may
699fail, but this would mean that the IRI cannot be resolved.
700In such cases (i.e., if the domain name conversion fails), the
701entire IRI conversion fails. Processors that have no mechanism for
702signalling a failure MAY instead substitute an otherwise
703invalid host name, although such processing SHOULD be avoided.
704 </t>
705
706<t>For example, the IRI
707<vspace/>"http://r&amp;#xE9;sum&amp;#xE9;.example.org"<vspace/> MAY be
708converted to <vspace/>"http://xn--rsum-bad.example.org"<vspace/>;
709conversion to percent-encoded form, e.g.,
710 <vspace/>"http://r%C3%A9sum%C3%A9.example.org", MUST NOT be performed. </t>
711
712<t><list style="hanging"> 
713
714<t hangText="Note:">Domain Names may appear in parts of an IRI other
715than the ireg-name part.  It is the responsibility of scheme-specific
716implementations (if the Internationalized Domain Name is part of the
717scheme syntax) or of server-side implementations (if the
718Internationalized Domain Name is part of 'iquery') to apply the
719necessary conversions at the appropriate point. Example: Trying to
720validate the Web page at<vspace/>
721http://r&amp;#xE9;sum&amp;#xE9;.example.org would lead to an IRI of
722<vspace/>http://validator.w3.org/check?uri=http%3A%2F%2Fr&amp;#xE9;sum&amp;#xE9;.<vspace/>example.org,
723which would convert to a URI
724of<vspace/>http://validator.w3.org/check?uri=http%3A%2F%2Fr%C3%A9sum%C3%A9.<vspace/>example.org.
725The server-side implementation is responsible for making the
726necessary conversions to be able to retrieve the Web page.</t>
727
728<t hangText="Note:">In this process, characters allowed in URI
729references and existing percent-encoded sequences are not encoded further.
730(This mapping is similar to, but different from, the encoding applied
731when arbitrary content is included in some part of a URI.)
732
733For example, an IRI of
734<vspace/>"http://www.example.org/red%09ros&amp;#xE9;#red"
735(in XML notation) is converted to
736<vspace/>"http://www.example.org/red%09ros%C3%A9#red", not to
737something like
738<vspace/>"http%3A%2F%2Fwww.example.org%2Fred%2509ros%C3%A9%23red".
739((DESIGN QUESTION: What about e.g. http://r%C3%A9sum%C3%A9.example.org in an IRI? Will that get converted to punycode, or not?))
740
741</t>
742
743</list></t>
744</section> <!-- dnsmapping -->
745
746<section title="Mapping query components" anchor="querymapping">
747
748<t>((NOTE: SEE ISSUES LIST))
749
750For compatibility with existing deployed HTTP infrastructure,
751the following special case applies for schemes "http" and "https"
752and IRIs whose origin has a document charset other than one which
753is UCS-based (e.g., UTF-8 or UTF-16). In such a case, the "query"
754component of an IRI is mapped into a URI by using the document
755charset rather than UTF-8 as the binary representation before
756pct-encoding. This mapping is not applied for any other scheme
757or component.</t>
758
759</section> <!-- querymapping -->
760
761<section title="Mapping IRIs to URIs" anchor="mapping">
762
763<t>The canonical mapping from an IRI to a URI is defined by applying the
764mapping above (from IRI to URI components) and then reassembling a URI
765from the parsed URI components using the original punctuation that
766delimited the IRI components. </t>
767
768</section> <!-- mapping -->
769
770<section title="Converting URIs to IRIs" anchor="URItoIRI">
771
772<t>In some situations, for presentation and further processing,
773it is desirable to convert a URI into an equivalent IRI in which
774natural characters are represented directly rather than
775percent encoded. Of course, every URI is already an IRI in
776its own right without any conversion, and in general there is more than
777one possible conversion procedure. This section gives one such procedure.
778</t>
779
780<t>
781The conversion described in this section, if given a valid URI, will
782result in an IRI that maps back to the URI used as an input for the
783conversion (except for potential case differences in percent-encoding
784and for potential percent-encoded unreserved characters).
785
786However, the IRI resulting from this conversion may differ
787from the original IRI (if there ever was one).</t> 
788
789<t>URI-to-IRI conversion removes percent-encodings, but not all
790percent-encodings can be eliminated. There are several reasons for
791this:</t>
792
793<t><list style="hanging">
794
795<t hangText="1.">Some percent-encodings are necessary to distinguish
796    percent-encoded and unencoded uses of reserved characters.</t>
797
798<t hangText="2.">Some percent-encodings cannot be interpreted as sequences
799    of UTF-8 octets.<vspace blankLines="1"/>
800    (Note: The octet patterns of UTF-8 are highly regular.
801    Therefore, there is a very high probability, but no guarantee,
802    that percent-encodings that can be interpreted as sequences of UTF-8
803    octets actually originated from UTF-8. For a detailed discussion,
804    see <xref target="Duerst97"/>.)</t>
805
806<t hangText="3.">The conversion may result in a character that is not
807    appropriate in an IRI. See <xref target="abnf"/>, <xref target="visual"/>,
808      and <xref target="limitations"/> for further details.</t>
809
810<t hangText="4.">IRI to URI conversion has different rules for
811    dealing with domain names and query parameters.</t>
812
813</list></t>
814
815<t>Conversion from a URI to an IRI MAY be done by using the following
816steps:
817
818<list style="hanging">
819<t hangText="1.">Represent the URI as a sequence of octets in
820       US-ASCII.</t>
821
822<t hangText="2.">Convert all percent-encodings ("%" followed by two
823      hexadecimal digits) to the corresponding octets, except those
824      corresponding to "%", characters in "reserved", and characters
825      in US-ASCII not allowed in URIs.</t> 
826
827<t hangText="3.">Re-percent-encode any octet produced in step 2 that
828      is not part of a strictly legal UTF-8 octet sequence.</t>
829
830
831<t hangText="4.">Re-percent-encode all octets produced in step 3 that
832      in UTF-8 represent characters that are not appropriate according
833      to <xref target="abnf"/>, <xref target="visual"/>, and <xref
834      target="limitations"/>.</t> 
835
836<t hangText="5.">Interpret the resulting octet sequence as a sequence
837      of characters encoded in UTF-8.</t>
838
839<t hangText="6.">URIs known to contain domain names in the reg-name
840      component SHOULD convert punycode-encoded domain name labels to
841      the corresponding characters using the ToUnicode procedure. </t>
842</list></t>
843
844<t>This procedure will convert as many percent-encoded characters as
845possible to characters in an IRI. Because there are some choices when
846step 4 is applied (see <xref target="limitations"/>), results may
847vary.</t>
848
849<t>Conversions from URIs to IRIs MUST NOT use any character
850encoding other than UTF-8 in steps 3 and 4, even if it might be
851possible to guess from the context that another character encoding
852than UTF-8 was used in the URI.  For example, the URI
853"http://www.example.org/r%E9sum%E9.html" might with some guessing be
854interpreted to contain two e-acute characters encoded as
855iso-8859-1. It must not be converted to an IRI containing these
856e-acute characters. Otherwise, in the future the IRI will be mapped to
857"http://www.example.org/r%C3%A9sum%C3%A9.html", which is a different
858URI from "http://www.example.org/r%E9sum%E9.html".</t>
859
860<section title="Examples">
861
862<t>This section shows various examples of converting URIs to IRIs.
863Each example shows the result after each of the steps 1 through 6 is
864applied. XML Notation is used for the final result.  Octets are
865denoted by "&lt;" followed by two hexadecimal digits followed by
866"&gt;".</t>
867
868<t>The following example contains the sequence "%C3%BC", which is a
869strictly legal UTF-8 sequence, and which is converted into the actual
870character U+00FC, LATIN SMALL LETTER U WITH DIAERESIS (also known as
871u-umlaut).
872
873<list style="hanging">
874<t hangText="1.">http://www.example.org/D%C3%BCrst</t>
875<t hangText="2.">http://www.example.org/D&lt;c3&gt;&lt;bc&gt;rst</t>
876<t hangText="3.">http://www.example.org/D&lt;c3&gt;&lt;bc&gt;rst</t>
877<t hangText="4.">http://www.example.org/D&lt;c3&gt;&lt;bc&gt;rst</t>
878<t hangText="5.">http://www.example.org/D&amp;#xFC;rst</t>
879<t hangText="6.">http://www.example.org/D&amp;#xFC;rst</t>
880</list>
881</t>
882
883<t>The following example contains the sequence "%FC", which might
884represent U+00FC, LATIN SMALL LETTER U WITH DIAERESIS, in
885the<vspace/>iso-8859-1 character encoding.  (It might represent other
886characters in other character encodings. For example, the octet
887&lt;fc&gt; in iso-8859-5 represents U+045C, CYRILLIC SMALL LETTER
888KJE.)  Because &lt;fc&gt; is not part of a strictly legal UTF-8
889sequence, it is re-percent-encoded in step 3.
890
891
892<list style="hanging">
893<t hangText="1.">http://www.example.org/D%FCrst</t>
894<t hangText="2.">http://www.example.org/D&lt;fc&gt;rst</t>
895<t hangText="3.">http://www.example.org/D%FCrst</t>
896<t hangText="4.">http://www.example.org/D%FCrst</t>
897<t hangText="5.">http://www.example.org/D%FCrst</t>
898<t hangText="6.">http://www.example.org/D%FCrst</t>
899</list>
900</t>
901
902<t>The following example contains "%e2%80%ae", which is the percent-encoded<vspace/>UTF-8
903character encoding of U+202E, RIGHT-TO-LEFT OVERRIDE. <xref target="visual"/>
904forbids the direct use of this character in an IRI. Therefore, the
905corresponding octets are re-percent-encoded in step 4. This example shows
906that the case (upper- or lowercase) of letters used in percent-encodings may not be preserved.
907The example also contains a punycode-encoded domain name label (xn--99zt52a),
908which is converted to the corresponding characters in step 6.
909
910<list style="hanging">
911<t hangText="1.">http://xn--99zt52a.example.org/%e2%80%ae</t>
912<t hangText="2.">http://xn--99zt52a.example.org/&lt;e2&gt;&lt;80&gt;&lt;ae&gt;</t>
913<t hangText="3.">http://xn--99zt52a.example.org/&lt;e2&gt;&lt;80&gt;&lt;ae&gt;</t>
914<t hangText="4.">http://xn--99zt52a.example.org/%E2%80%AE</t>
915<t hangText="5.">http://xn--99zt52a.example.org/%E2%80%AE</t>
916<t hangText="6.">http://&amp;#x7D0D;&amp;#x8C46;.example.org/%E2%80%AE</t>
917</list></t>
918
919<t>Note that the label "xn--99zt52a" is converted to U+7D0D U+8C46
920(Japanese Natto). ((EDITOR NOTE: There is some inconsistency in this note.))</t>
921
922</section> <!-- examples -->
923</section> <!-- URItoIRI -->
924</section> <!-- processing -->
925<section title="Bidirectional IRIs for Right-to-Left Languages" anchor="Bidi">
926
927<t>Some UCS characters, such as those used in the Arabic and Hebrew
928scripts, have an inherent right-to-left (rtl) writing direction. IRIs
929containing these characters (called bidirectional IRIs or Bidi IRIs)
930require additional attention because of the non-trivial relation
931between logical representation (used for digital representation and
932for reading/spelling) and visual representation (used for
933display/printing).</t>
934
935<t>Because of the complex interaction between the logical representation,
936the visual representation, and the syntax of a Bidi IRI, a balance is
937needed between various requirements.
938The main requirements are<list style="hanging">
939<t hangText="1.">user-predictable conversion between visual and
940    logical representation;</t>
941<t hangText="2.">the ability to include a wide range of characters
942    in various parts of the IRI; and</t>
943<t hangText="3.">minor or no changes or restrictions for
944      implementations.</t>
945</list></t>
946
947<section title="Logical Storage and Visual Presentation" anchor="visual">
948
949<t>When stored or transmitted in digital representation, bidirectional
950IRIs MUST be in full logical order and MUST conform to the IRI syntax
951rules (which includes the rules relevant to their scheme). This
952ensures that bidirectional IRIs can be processed in the same way as
953other IRIs.</t> <t>Bidirectional IRIs MUST be rendered by using the
954Unicode Bidirectional Algorithm <xref target="UNIV6"/>, <xref
955target="UNI9"/>.  Bidirectional IRIs MUST be rendered in the same way
956as they would be if they were in a left-to-right embedding; i.e., as
957if they were preceded by U+202A, LEFT-TO-RIGHT EMBEDDING (LRE), and
958followed by U+202C, POP DIRECTIONAL FORMATTING (PDF).  Setting the
959embedding direction can also be done in a higher-level protocol (e.g.,
960the dir='ltr' attribute in HTML).</t> 
961
962<t>There is no requirement to use the above embedding if the display
963is still the same without the embedding. For example, a bidirectional
964IRI in a text with left-to-right base directionality (such as used for
965English or Cyrillic) that is preceded and followed by whitespace and
966strong left-to-right characters does not need an embedding.  Also, a
967bidirectional relative IRI reference that only contains strong
968right-to-left characters and weak characters and that starts and ends
969with a strong right-to-left character and appears in a text with
970right-to-left base directionality (such as used for Arabic or Hebrew)
971and is preceded and followed by whitespace and strong characters does
972not need an embedding.</t>
973
974<t>In some other cases, using U+200E, LEFT-TO-RIGHT MARK (LRM), may be
975sufficient to force the correct display behavior.  However, the
976details of the Unicode Bidirectional Algorithm are not always easy to
977understand. Implementers are strongly advised to err on the side of
978caution and to use embedding in all cases where they are not
979completely sure that the display behavior is unaffected without the
980embedding.</t>
981
982<t>The Unicode Bidirectional Algorithm (<xref target="UNI9"/>, section
9834.3) permits higher-level protocols to influence bidirectional
984rendering. Such changes by higher-level protocols MUST NOT be used if
985they change the rendering of IRIs.</t> 
986
987<t>The bidirectional formatting characters that may be used before or
988after the IRI to ensure correct display are not themselves part of the
989IRI.  IRIs MUST NOT contain bidirectional formatting characters (LRM,
990RLM, LRE, RLE, LRO, RLO, and PDF). They affect the visual rendering of
991the IRI but do not appear themselves. It would therefore not be
992possible to input an IRI with such characters correctly.</t>
993
994</section> <!-- visual -->
995<section title="Bidi IRI Structure" anchor="bidi-structure">
996
997<t>The Unicode Bidirectional Algorithm is designed mainly for running
998text.  To make sure that it does not affect the rendering of
999bidirectional IRIs too much, some restrictions on bidirectional IRIs
1000are necessary. These restrictions are given in terms of delimiters
1001(structural characters, mostly punctuation such as "@", ".", ":",
1002and<vspace/>"/") and components (usually consisting mostly of letters
1003and digits).</t>
1004
1005<t>The following syntax rules from <xref target="abnf"/> correspond to
1006components for the purpose of Bidi behavior: iuserinfo, ireg-name,
1007isegment, isegment-nz, isegment-nz-nc, iquery, and
1008ifragment.</t>
1009
1010<t>Specifications that define the syntax of any of the above
1011components MAY divide them further and define smaller parts to be
1012components according to this document. As an example, the restrictions
1013of <xref target="RFC3490"/> on bidirectional domain names correspond
1014to treating each label of a domain name as a component for schemes
1015with ireg-name as a domain name.  Even where the components are not
1016defined formally, it may be helpful to think about some syntax in
1017terms of components and to apply the relevant restrictions.  For
1018example, for the usual name/value syntax in query parts, it is
1019convenient to treat each name and each value as a component. As
1020another example, the extensions in a resource name can be treated as
1021separate components.</t>
1022
1023<t>For each component, the following restrictions apply:</t>
1024<t>
1025<list style="hanging">
1026
1027<t hangText="1.">A component SHOULD NOT use both right-to-left and
1028  left-to-right characters.</t>
1029
1030<t hangText="2.">A component using right-to-left characters SHOULD
1031  start and end with right-to-left characters.</t>
1032
1033</list></t>
1034
1035<t>The above restrictions are given as "SHOULD"s, rather than as
1036"MUST"s.  For IRIs that are never presented visually, they are not
1037relevant.  However, for IRIs in general, they are very important to
1038ensure consistent conversion between visual presentation and logical
1039representation, in both directions.</t>
1040
1041<t><list style="hanging">
1042
1043<t hangText="Note:">In some components, the above restrictions may
1044  actually be strictly enforced.  For example, <xref
1045  target="RFC3490"></xref> requires that these restrictions apply to
1046  the labels of a host name for those schemes where ireg-name is a
1047  host name.  In some other components (for example, path components)
1048  following these restrictions may not be too difficult.  For other
1049  components, such as parts of the query part, it may be very
1050  difficult to enforce the restrictions because the values of query
1051  parameters may be arbitrary character sequences.</t>
1052
1053</list></t>
1054
1055<t>If the above restrictions cannot be satisfied otherwise, the
1056affected component can always be mapped to URI notation as described
1057in <xref target="compmapping"/>. Please note that the whole component
1058has to be mapped (see also Example 9 below).</t>
1059
1060</section> <!-- bidi-structure -->
1061
1062<section title="Input of Bidi IRIs" anchor="bidiInput">
1063
1064<t>Bidi input methods MUST generate Bidi IRIs in logical order while
1065rendering them according to <xref target="visual"/>.  During input,
1066rendering SHOULD be updated after every new character is input to
1067avoid end-user confusion.</t>
1068
1069</section> <!-- bidiInput -->
1070
1071<section title="Examples">
1072
1073<t>This section gives examples of bidirectional IRIs, in Bidi
1074Notation.  It shows legal IRIs with the relationship between logical
1075and visual representation and explains how certain phenomena in this
1076relationship may look strange to somebody not familiar with
1077bidirectional behavior, but familiar to users of Arabic and Hebrew. It
1078also shows what happens if the restrictions given in <xref
1079target="bidi-structure"/> are not followed. The examples below can be
1080seen at <xref target="BidiEx"/>, in Arabic, Hebrew, and Bidi Notation
1081variants.</t>
1082
1083<t>To read the bidi text in the examples, read the visual
1084representation from left to right until you encounter a block of rtl
1085text. Read the rtl block (including slashes and other special
1086characters) from right to left, then continue at the next unread ltr
1087character.</t>
1088
1089<t>Example 1: A single component with rtl characters is inverted:
1090<vspace/>Logical representation:
1091"http://ab.CDEFGH.ij/kl/mn/op.html"<vspace/>Visual representation:
1092"http://ab.HGFEDC.ij/kl/mn/op.html"<vspace/> Components can be read
1093one by one, and each component can be read in its natural
1094direction.</t>
1095
1096<t>Example 2: More than one consecutive component with rtl characters
1097is inverted as a whole: <vspace/>Logical representation:
1098"http://ab.CDE.FGH/ij/kl/mn/op.html"<vspace/>Visual representation:
1099"http://ab.HGF.EDC/ij/kl/mn/op.html"<vspace/> A sequence of rtl
1100components is read rtl, in the same way as a sequence of rtl words is
1101read rtl in a bidi text.</t>
1102
1103<t>Example 3: All components of an IRI (except for the scheme) are
1104rtl.  All rtl components are inverted overall: <vspace/>Logical
1105representation:
1106"http://AB.CD.EF/GH/IJ/KL?MN=OP;QR=ST#UV"<vspace/>Visual
1107representation: "http://VU#TS=RQ;PO=NM?LK/JI/HG/FE.DC.BA"<vspace/> The
1108whole IRI (except the scheme) is read rtl. Delimiters between rtl
1109components stay between the respective components; delimiters between
1110ltr and rtl components don't move.</t>
1111
1112<t>Example 4: Each of several sequences of rtl components is inverted
1113on its own: <vspace/>Logical representation:
1114"http://AB.CD.ef/gh/IJ/KL.html"<vspace/>Visual representation:
1115"http://DC.BA.ef/gh/LK/JI.html"<vspace/> Each sequence of rtl
1116components is read rtl, in the same way as each sequence of rtl words
1117in an ltr text is read rtl.</t>
1118
1119<t>Example 5: Example 2, applied to components of different kinds:
1120<vspace/>Logical representation: "http://ab.cd.EF/GH/ij/kl.html"
1121<vspace/>Visual representation:
1122"http://ab.cd.HG/FE/ij/kl.html"<vspace/> The inversion of the domain
1123name label and the path component may be unexpected, but it is
1124consistent with other bidi behavior.  For reassurance that the domain
1125component really is "ab.cd.EF", it may be helpful to read aloud the
1126visual representation following the bidi algorithm. After
1127"http://ab.cd." one reads the RTL block "E-F-slash-G-H", which
1128corresponds to the logical representation.
1129</t>
1130
1131<t>Example 6: Same as Example 5, with more rtl components:
1132<vspace/>Logical representation:
1133"http://ab.CD.EF/GH/IJ/kl.html"<vspace/>Visual representation:
1134"http://ab.JI/HG/FE.DC/kl.html"<vspace/> The inversion of the domain
1135name labels and the path components may be easier to identify because
1136the delimiters also move.</t>
1137
1138<t>Example 7: A single rtl component includes digits: <vspace/>Logical
1139representation: "http://ab.CDE123FGH.ij/kl/mn/op.html"<vspace/>Visual
1140representation: "http://ab.HGF123EDC.ij/kl/mn/op.html"<vspace/>
1141Numbers are written ltr in all cases but are treated as an additional
1142embedding inside a run of rtl characters. This is completely
1143consistent with usual bidirectional text.</t>
1144
1145<t>Example 8 (not allowed): Numbers are at the start or end of an rtl
1146component:<vspace/>Logical representation:
1147"http://ab.cd.ef/GH1/2IJ/KL.html"<vspace/>Visual representation:
1148"http://ab.cd.ef/LK/JI1/2HG.html"<vspace/> The sequence "1/2" is
1149interpreted by the bidi algorithm as a fraction, fragmenting the
1150components and leading to confusion. There are other characters that
1151are interpreted in a special way close to numbers; in particular, "+",
1152"-", "#", "$", "%", ",", ".", and ":".</t>
1153
1154<t>Example 9 (not allowed): The numbers in the previous example are
1155percent-encoded: <vspace/>Logical representation:
1156"http://ab.cd.ef/GH%31/%32IJ/KL.html"<vspace/>Visual representation:
1157"http://ab.cd.ef/LK/JI%32/%31HG.html"</t>
1158
1159<t>Example 10 (allowed but not recommended): <vspace/>Logical
1160representation: "http://ab.CDEFGH.123/kl/mn/op.html"<vspace/>Visual
1161representation: "http://ab.123.HGFEDC/kl/mn/op.html"<vspace/>
1162Components consisting of only numbers are allowed (it would be rather
1163difficult to prohibit them), but these may interact with adjacent RTL
1164components in ways that are not easy to predict.</t>
1165
1166<t>Example 11 (allowed but not recommended): <vspace/>Logical
1167representation: "http://ab.CDEFGH.123ij/kl/mn/op.html"<vspace/>Visual
1168representation: "http://ab.123.HGFEDCij/kl/mn/op.html"<vspace/>
1169Components consisting of numbers and left-to-right characters are
1170allowed, but these may interact with adjacent RTL components in ways
1171that are not easy to predict.</t>
1172</section><!-- examples -->
1173</section><!-- bidi -->
1174
1175<section title="Normalization and Comparison" anchor="equivalence">
1176
1177<t><list style="hanging"><t hangText="Note:">The structure and much of
1178  the material for this section is taken from section 6 of <xref
1179  target="RFC3986"></xref>; the differences are due to the specifics
1180  of IRIs.</t></list></t>
1181
1182<t>One of the most common operations on IRIs is simple comparison:
1183Determining whether two IRIs are equivalent, without using the IRIs to
1184access their respective resource(s). A comparison is performed
1185whenever a response cache is accessed, a browser checks its history to
1186color a link, or an XML parser processes tags within a
1187namespace. Extensive normalization prior to comparison of IRIs may be
1188used by spiders and indexing engines to prune a search space or reduce
1189duplication of request actions and response storage.</t>
1190
1191<t>IRI comparison is performed for some particular purpose. Protocols
1192or implementations that compare IRIs for different purposes will often
1193be subject to differing design trade-offs in regards to how much
1194effort should be spent in reducing aliased identifiers. This section
1195describes various methods that may be used to compare IRIs, the
1196trade-offs between them, and the types of applications that might use
1197them.</t>
1198
1199<section title="Equivalence">
1200
1201<t>Because IRIs exist to identify resources, presumably they should be
1202considered equivalent when they identify the same resource. However,
1203this definition of equivalence is not of much practical use, as there
1204is no way for an implementation to compare two resources to determine
1205if they are "the same" unless it has full knowledge or control of
1206them. For this reason, determination of equivalence or difference of
1207IRIs is based on string comparison, perhaps augmented by reference to
1208additional rules provided by URI scheme definitions.  We use the terms
1209"different" and "equivalent" to describe the possible outcomes of such
1210comparisons, but there are many application-dependent versions of
1211equivalence.</t>
1212
1213<t>Even when it is possible to determine that two IRIs are equivalent,
1214IRI comparison is not sufficient to determine whether two IRIs
1215identify different resources. For example, an owner of two different
1216domain names could decide to serve the same resource from both,
1217resulting in two different IRIs. Therefore, comparison methods are
1218designed to minimize false negatives while strictly avoiding false
1219positives.</t>
1220
1221<t>In testing for equivalence, applications should not directly
1222compare relative references; the references should be converted to
1223their respective target IRIs before comparison. When IRIs are compared
1224to select (or avoid) a network action, such as retrieval of a
1225representation, fragment components (if any) should be excluded from
1226the comparison.</t>
1227
1228<t>Applications using IRIs as identity tokens with no relationship to
1229a protocol MUST use the Simple String Comparison (see <xref
1230target="stringcomp"></xref>).  All other applications MUST select one
1231of the comparison practices from the Comparison Ladder (see <xref
1232target="ladder"></xref>).</t>
1233</section> <!-- equivalence -->
1234
1235
1236<section title="Preparation for Comparison">
1237<t>Any kind of IRI comparison REQUIRES that any additional contextual
1238processing is first performed, including undoing higher-level
1239escapings or encodings in the protocol or format that carries an
1240IRI. This preprocessing is usually done when the protocol or format is
1241parsed.</t>
1242
1243<t>Examples of contextual preprocessing steps are described in <xref
1244target="LEIRIHREF"/>. </t>
1245
1246<t>Examples of such escapings or encodings are entities and
1247numeric character references in <xref target="HTML4"></xref> and <xref
1248target="XML1"></xref>. As an example,
1249"http://example.org/ros&amp;eacute;" (in HTML),
1250"http://example.org/ros&amp;#233;" (in HTML or XML), and
1251<vspace/>"http://example.org/ros&amp;#xE9;" (in HTML or XML) are all
1252resolved into what is denoted in this document (see <xref
1253target="sec-Notation"></xref>) as "http://example.org/ros&amp;#xE9;"
1254(the "&amp;#xE9;" here standing for the actual e-acute character, to
1255compensate for the fact that this document cannot contain non-ASCII
1256characters).</t>
1257
1258<t>Similar considerations apply to encodings such as Transfer Codings
1259in HTTP (see <xref target="RFC2616"></xref>) and Content Transfer
1260Encodings in MIME (<xref target="RFC2045"></xref>), although in these
1261cases, the encoding is based not on characters but on octets, and
1262additional care is required to make sure that characters, and not just
1263arbitrary octets, are compared (see <xref
1264target="stringcomp"></xref>).</t>
1265
1266</section> <!-- preparation -->
1267
1268<section title="Comparison Ladder" anchor="ladder">
1269
1270<t>In practice, a variety of methods are used to test IRI
1271equivalence. These methods fall into a range distinguished by the
1272amount of processing required and the degree to which the probability
1273of false negatives is reduced. As noted above, false negatives cannot
1274be eliminated. In practice, their probability can be reduced, but this
1275reduction requires more processing and is not cost-effective for all
1276applications.</t>
1277
1278
1279<t>If this range of comparison practices is considered as a ladder,
1280the following discussion will climb the ladder, starting with
1281practices that are cheap but have a relatively higher chance of
1282producing false negatives, and proceeding to those that have higher
1283computational cost and lower risk of false negatives.</t>
1284
1285<section title="Simple String Comparison" anchor="stringcomp">
1286
1287<t>If two IRIs, when considered as character strings, are identical,
1288then it is safe to conclude that they are equivalent.  This type of
1289equivalence test has very low computational cost and is in wide use in
1290a variety of applications, particularly in the domain of parsing. It
1291is also used when a definitive answer to the question of IRI
1292equivalence is needed that is independent of the scheme used and that
1293can be calculated quickly and without accessing a network. An example
1294of such a case is XML Namespaces (<xref
1295target="XMLNamespace"></xref>).</t>
1296
1297
1298<t>Testing strings for equivalence requires some basic precautions.
1299This procedure is often referred to as "bit-for-bit" or
1300"byte-for-byte" comparison, which is potentially misleading. Testing
1301strings for equality is normally based on pair comparison of the
1302characters that make up the strings, starting from the first and
1303proceeding until both strings are exhausted and all characters are
1304found to be equal, until a pair of characters compares unequal, or
1305until one of the strings is exhausted before the other.</t>
1306
1307<t>This character comparison requires that each pair of characters be
1308put in comparable encoding form. For example, should one IRI be stored
1309in a byte array in UTF-8 encoding form and the second in a UTF-16
1310encoding form, bit-for-bit comparisons applied naively will produce
1311errors. It is better to speak of equality on a character-for-character
1312rather than on a byte-for-byte or bit-for-bit basis.  In practical
1313terms, character-by-character comparisons should be done codepoint by
1314codepoint after conversion to a common character encoding form.
1315
1316When comparing character by character, the comparison function MUST
1317NOT map IRIs to URIs, because such a mapping would create additional
1318spurious equivalences. It follows that an IRI SHOULD NOT be modified
1319when being transported if there is any chance that this IRI might be
1320used in a context that uses Simple String Comparison.</t>
1321
1322
1323<t>False negatives are caused by the production and use of IRI
1324aliases. Unnecessary aliases can be reduced, regardless of the
1325comparison method, by consistently providing IRI references in an
1326already normalized form (i.e., a form identical to what would be
1327produced after normalization is applied, as described below).
1328Protocols and data formats often limit some IRI comparisons to simple
1329string comparison, based on the theory that people and implementations
1330will, in their own best interest, be consistent in providing IRI
1331references, or at least be consistent enough to negate any efficiency
1332that might be obtained from further normalization.</t>
1333</section> <!-- stringcomp -->
1334
1335<section title="Syntax-Based Normalization">
1336
1337<figure><preamble>Implementations may use logic based on the
1338definitions provided by this specification to reduce the probability
1339of false negatives. This processing is moderately higher in cost than
1340character-for-character string comparison. For example, an application
1341using this approach could reasonably consider the following two IRIs
1342equivalent:</preamble>
1343
1344<artwork>
1345   example://a/b/c/%7Bfoo%7D/ros&amp;#xE9;
1346   eXAMPLE://a/./b/../b/%63/%7bfoo%7d/ros%C3%A9
1347</artwork></figure>
1348
1349<t>Web user agents, such as browsers, typically apply this type of IRI
1350normalization when determining whether a cached response is
1351available. Syntax-based normalization includes such techniques as case
1352normalization, character normalization, percent-encoding
1353normalization, and removal of dot-segments.</t>
1354
1355<section title="Case Normalization">
1356
1357<t>For all IRIs, the hexadecimal digits within a percent-encoding
1358triplet (e.g., "%3a" versus "%3A") are case-insensitive and therefore
1359should be normalized to use uppercase letters for the digits A-F.</t>
1360
1361<t>When an IRI uses components of the generic syntax, the component
1362syntax equivalence rules always apply; namely, that the scheme and
1363US-ASCII only host are case insensitive and therefore should be
1364normalized to lowercase. For example, the URI
1365"HTTP://www.EXAMPLE.com/" is equivalent to
1366"http://www.example.com/". Case equivalence for non-ASCII characters
1367in IRI components that are IDNs is discussed in <xref
1368target="schemecomp"></xref>.  The other generic syntax components are
1369assumed to be case sensitive unless specifically defined otherwise by
1370the scheme.</t>
1371
1372<t>Creating schemes that allow case-insensitive syntax components
1373containing non-ASCII characters should be avoided. Case normalization
1374of non-ASCII characters can be culturally dependent and is always a
1375complex operation. The only exception concerns non-ASCII host names
1376for which the character normalization includes a mapping step derived
1377from case folding.</t>
1378
1379</section> <!-- casenorm -->
1380
1381<section title="Character Normalization" anchor="normalization">
1382
1383<t>The Unicode Standard <xref target="UNIV6"></xref> defines various
1384equivalences between sequences of characters for various
1385purposes. Unicode Standard Annex #15 <xref target="UTR15"></xref>
1386defines various Normalization Forms for these equivalences, in
1387particular Normalization Form C (NFC, Canonical Decomposition,
1388followed by Canonical Composition) and Normalization Form KC (NFKC,
1389Compatibility Decomposition, followed by Canonical Composition).</t>
1390
1391<t> IRIs already in Unicode MUST NOT be normalized before parsing or
1392interpreting. In many non-Unicode character encodings, some text
1393cannot be represented directly. For example, the word "Vietnam" is
1394natively written "Vi&amp;#x1EC7;t Nam" (containing a LATIN SMALL
1395LETTER E WITH CIRCUMFLEX AND DOT BELOW) in NFC, but a direct
1396transcoding from the windows-1258 character encoding leads to
1397"Vi&amp;#xEA;&amp;#x323;t Nam" (containing a LATIN SMALL LETTER E WITH
1398CIRCUMFLEX followed by a COMBINING DOT BELOW). Direct transcoding of
1399other 8-bit encodings of Vietnamese may lead to other
1400representations.</t>
1401
1402<t>Equivalence of IRIs MUST rely on the assumption that IRIs are
1403appropriately pre-character-normalized rather than apply character
1404normalization when comparing two IRIs. The exceptions are conversion
1405from a non-digital form, and conversion from a non-UCS-based character
1406encoding to a UCS-based character encoding. In these cases, NFC or a
1407normalizing transcoder using NFC MUST be used for interoperability. To
1408avoid false negatives and problems with transcoding, IRIs SHOULD be
1409created by using NFC. Using NFKC may avoid even more problems; for
1410example, by choosing half-width Latin letters instead of full-width
1411ones, and full-width instead of half-width Katakana.</t>
1412
1413
1414<t>As an example,
1415"http://www.example.org/r&amp;#xE9;sum&amp;#xE9;.html" (in XML
1416Notation) is in NFC. On the other hand,
1417"http://www.example.org/re&amp;#x301;sume&amp;#x301;.html" is not in
1418NFC.</t>
1419
1420<t>The former uses precombined e-acute characters, and the latter uses
1421"e" characters followed by combining acute accents. Both usages are
1422defined as canonically equivalent in <xref target="UNIV6"></xref>.</t>
1423
1424<t><list style="hanging">
1425
1426<t hangText="Note:">
1427Because it is unknown how a particular sequence of characters is being
1428treated with respect to character normalization, it would be
1429inappropriate to allow third parties to normalize an IRI
1430arbitrarily. This does not contradict the recommendation that when a
1431resource is created, its IRI should be as character normalized as
1432possible (i.e., NFC or even NFKC). This is similar to the
1433uppercase/lowercase problems.  Some parts of a URI are case
1434insensitive (for example, the domain name). For others, it is unclear
1435whether they are case sensitive, case insensitive, or something in
1436between (e.g., case sensitive, but with a multiple choice selection if
1437the wrong case is used, instead of a direct negative result).  The
1438best recipe is that the creator use a reasonable capitalization and,
1439when transferring the URI, capitalization never be
1440changed.</t></list></t>
1441
1442<t>Various IRI schemes may allow the usage of Internationalized Domain
1443Names (IDN) <xref target="RFC3490"></xref> either in the ireg-name
1444part or elsewhere. Character Normalization also applies to IDNs, as
1445discussed in <xref target="schemecomp"></xref>.</t>
1446</section> <!-- charnorm -->
1447
1448<section title="Percent-Encoding Normalization">
1449
1450<t>The percent-encoding mechanism (Section 2.1 of <xref
1451target="RFC3986"></xref>) is a frequent source of variance among
1452otherwise identical IRIs. In addition to the case normalization issue
1453noted above, some IRI producers percent-encode octets that do not
1454require percent-encoding, resulting in IRIs that are equivalent to
1455their nonencoded counterparts. These IRIs should be normalized by
1456decoding any percent-encoded octet sequence that corresponds to an
1457unreserved character, as described in Section 2.3 of <xref
1458target="RFC3986"></xref>.</t>
1459
1460<t>For actual resolution, differences in percent-encoding (except for
1461the percent-encoding of reserved characters) MUST always result in the
1462same resource.  For example, "http://example.org/~user",
1463"http://example.org/%7euser", and "http://example.org/%7Euser" must
1464resolve to the same resource.</t>
1465
1466<t>If this kind of equivalence is to be tested, the percent-encoding
1467of both IRIs to be compared has to be aligned; for example, by
1468converting both IRIs to URIs (see Section 3.1), eliminating escape
1469differences in the resulting URIs, and making sure that the case of
1470the hexadecimal characters in the percent-encoding is always the same
1471(preferably upper case). If the IRI is to be passed to another
1472application or used further in some other way, its original form MUST
1473be preserved.  The conversion described here should be performed only
1474for local comparison.</t>
1475
1476</section> <!-- pctnorm -->
1477
1478<section title="Path Segment Normalization">
1479
1480<t>The complete path segments "." and ".." are intended only for use
1481within relative references (Section 4.1 of <xref
1482target="RFC3986"></xref>) and are removed as part of the reference
1483resolution process (Section 5.2 of <xref target="RFC3986"></xref>).
1484However, some implementations may incorrectly assume that reference
1485resolution is not necessary when the reference is already an IRI, and
1486thus fail to remove dot-segments when they occur in non-relative
1487paths.  IRI normalizers should remove dot-segments by applying the
1488remove_dot_segments algorithm to the path, as described in Section
14895.2.4 of <xref target="RFC3986"></xref>.</t>
1490
1491</section> <!-- pathnorm -->
1492</section> <!-- ladder -->
1493
1494<section title="Scheme-Based Normalization" anchor="schemecomp">
1495
1496<t>The syntax and semantics of IRIs vary from scheme to scheme, as
1497described by the defining specification for each
1498scheme. Implementations may use scheme-specific rules, at further
1499processing cost, to reduce the probability of false negatives. For
1500example, because the "http" scheme makes use of an authority
1501component, has a default port of "80", and defines an empty path to be
1502equivalent to "/", the following four IRIs are equivalent:</t>
1503
1504<figure><artwork>
1505   http://example.com
1506   http://example.com/
1507   http://example.com:/
1508   http://example.com:80/</artwork></figure>
1509
1510<t>In general, an IRI that uses the generic syntax for authority with
1511an empty path should be normalized to a path of "/". Likewise, an
1512explicit ":port", for which the port is empty or the default for the
1513scheme, is equivalent to one where the port and its ":" delimiter are
1514elided and thus should be removed by scheme-based normalization. For
1515example, the second IRI above is the normal form for the "http"
1516scheme.</t>
1517
1518<t>Another case where normalization varies by scheme is in the
1519handling of an empty authority component or empty host
1520subcomponent. For many scheme specifications, an empty authority or
1521host is considered an error; for others, it is considered equivalent
1522to "localhost" or the end-user's host. When a scheme defines a default
1523for authority and an IRI reference to that default is desired, the
1524reference should be normalized to an empty authority for the sake of
1525uniformity, brevity, and internationalization. If, however, either the
1526userinfo or port subcomponents are non-empty, then the host should be
1527given explicitly even if it matches the default.</t>
1528
1529<t>Normalization should not remove delimiters when their associated
1530component is empty unless it is licensed to do so by the scheme
1531specification. For example, the IRI "http://example.com/?" cannot be
1532assumed to be equivalent to any of the examples above. Likewise, the
1533presence or absence of delimiters within a userinfo subcomponent is
1534usually significant to its interpretation.  The fragment component is
1535not subject to any scheme-based normalization; thus, two IRIs that
1536differ only by the suffix "#" are considered different regardless of
1537the scheme.</t>
1538 
1539<t>Some IRI schemes allow the usage of Internationalized Domain
1540Names (IDN) <xref target='RFC5890'></xref> either in their ireg-name
1541part or elsewhere. When in use in IRIs, those names SHOULD
1542conform to the definition of U-Label in <xref
1543target='RFC5890'></xref>. An IRI containing an invalid IDN cannot
1544successfully be resolved. For legibility purposes, they
1545SHOULD NOT be converted into ASCII Compatible Encoding (ACE).</t>
1546
1547<t>Scheme-based normalization may also consider IDN
1548components and their conversions to punycode as equivalent. As an
1549example, "http://r&amp;#xE9;sum&amp;#xE9;.example.org" may be
1550considered equivalent to
1551"http://xn--rsum-bpad.example.org".</t><t>Other scheme-specific
1552normalizations are possible.</t>
1553
1554</section> <!-- schemenorm -->
1555
1556<section title="Protocol-Based Normalization">
1557
1558<t>Substantial effort to reduce the incidence of false negatives is
1559often cost-effective for web spiders. Consequently, they implement
1560even more aggressive techniques in IRI comparison. For example, if
1561they observe that an IRI such as</t>
1562
1563<figure><artwork>
1564   http://example.com/data</artwork></figure>
1565<t>redirects to an IRI differing only in the trailing slash</t>
1566<figure><artwork>
1567   http://example.com/data/</artwork></figure>
1568
1569<t>they will likely regard the two as equivalent in the future.  This
1570kind of technique is only appropriate when equivalence is clearly
1571indicated by both the result of accessing the resources and the common
1572conventions of their scheme's dereference algorithm (in this case, use
1573of redirection by HTTP origin servers to avoid problems with relative
1574references).</t>
1575
1576</section> <!-- protonorm -->
1577</section> <!-- equivalence -->
1578</section> 
1579
1580<section title="Use of IRIs" anchor="IRIuse">
1581
1582<section title="Limitations on UCS Characters Allowed in IRIs" anchor="limitations">
1583
1584<t>This section discusses limitations on characters and character
1585sequences usable for IRIs beyond those given in <xref target="abnf"/>
1586and <xref target="visual"/>. The considerations in this section are
1587relevant when IRIs are created and when URIs are converted to
1588IRIs.</t>
1589
1590<t>
1591
1592<list style="hanging"><t hangText="a.">The repertoire of characters allowed
1593    in each IRI component is limited by the definition of that component.
1594    For example, the definition of the scheme component does not allow
1595    characters beyond US-ASCII.
1596    <vspace blankLines="1"/>
1597    (Note: In accordance with URI practice, generic IRI
1598    software cannot and should not check for such limitations.)</t>
1599
1600<t hangText="b.">The UCS contains many areas of characters for which
1601    there are strong visual look-alikes. Because of the likelihood of
1602    transcription errors, these also should be avoided. This includes
1603    the full-width equivalents of Latin characters, half-width
1604    Katakana characters for Japanese, and many others. It also
1605    includes many look-alikes of "space", "delims", and "unwise",
1606    characters excluded in <xref target="RFC3491"/>.</t>
1607   
1608</list>
1609</t>
1610
1611<t>Additional information is available from <xref target="UNIXML"/>.
1612    <xref target="UNIXML"/> is written in the context of running text
1613    rather than in that of identifiers. Nevertheless, it discusses
1614    many of the categories of characters not appropriate for IRIs.</t>
1615</section> <!-- limitations -->
1616
1617<section title="Software Interfaces and Protocols">
1618
1619<t>Although an IRI is defined as a sequence of characters, software
1620interfaces for URIs typically function on sequences of octets or other
1621kinds of code units. Thus, software interfaces and protocols MUST
1622define which character encoding is used.</t>
1623
1624<t>Intermediate software interfaces between IRI-capable components and
1625URI-only components MUST map the IRIs per <xref target="mapping"/>,
1626when transferring from IRI-capable to URI-only components.
1627
1628This mapping SHOULD be applied as late as possible. It SHOULD NOT be
1629applied between components that are known to be able to handle IRIs.</t>
1630</section> <!-- software -->
1631
1632<section title="Format of URIs and IRIs in Documents and Protocols">
1633
1634<t>Document formats that transport URIs may have to be upgraded to allow
1635the transport of IRIs. In cases where the document as a whole
1636has a native character encoding, IRIs MUST also be encoded in this
1637character encoding and converted accordingly by a parser or interpreter.
1638
1639IRI characters not expressible in the native character encoding SHOULD
1640be escaped by using the escaping conventions of the document format if
1641such conventions are available. Alternatively, they MAY be
1642percent-encoded according to <xref target="mapping"/>. For example, in
1643HTML or XML, numeric character references SHOULD be used. If a
1644document as a whole has a native character encoding and that character
1645encoding is not UTF-8, then IRIs MUST NOT be placed into the document
1646in the UTF-8 character encoding.</t>
1647
1648<t>((UPDATE THIS NOTE)) Note: Some formats already accommodate IRIs,
1649although they use different terminology. HTML 4.0 <xref
1650target="HTML4"/> defines the conversion from IRIs to URIs as
1651error-avoiding behavior. XML 1.0 <xref target="XML1"/>, XLink <xref
1652target="XLink"/>, XML Schema <xref target="XMLSchema"/>, and
1653specifications based upon them allow IRIs. Also, it is expected that
1654all relevant new W3C formats and protocols will be required to handle
1655IRIs <xref target="CharMod"/>.</t>
1656
1657</section> <!-- format -->
1658
1659<section title="Use of UTF-8 for Encoding Original Characters" anchor="UTF8use">
1660
1661<t>This section discusses details and gives examples for point c) in
1662<xref target="Applicability"/>. To be able to use IRIs, the URI
1663corresponding to the IRI in question has to encode original characters
1664into octets by using UTF-8.  This can be specified for all URIs of a
1665URI scheme or can apply to individual URIs for schemes that do not
1666specify how to encode original characters.  It can apply to the whole
1667URI, or only to some part. For background information on encoding
1668characters into URIs, see also Section 2.5 of <xref
1669target="RFC3986"/>.</t>
1670
1671<t>For new URI schemes, using UTF-8 is recommended in <xref
1672target="RFC4395bis"/>.  Examples where UTF-8 is already used are the URN
1673syntax <xref target="RFC2141"/>, IMAP URLs <xref target="RFC2192"/>,
1674and POP URLs <xref target="RFC2384"/>.  On the other hand, because the
1675HTTP URI scheme does not specify how to encode original characters,
1676only some HTTP URLs can have corresponding but different IRIs.</t>
1677
1678<t>For example, for a document with a URI
1679of<vspace/>"http://www.example.org/r%C3%A9sum%C3%A9.html", it is
1680possible to construct a corresponding IRI (in XML notation, see <xref
1681target="sec-Notation"/>):
1682"http://www.example.org/r&amp;#xE9;sum&amp;#xE9;.html" ("&amp;#xE9;"
1683stands for the e-acute character, and "%C3%A9" is the UTF-8 encoded
1684and percent-encoded representation of that character). On the other
1685hand, for a document with a URI of
1686"http://www.example.org/r%E9sum%E9.html", the percent-encoding octets
1687cannot be converted to actual characters in an IRI, as the
1688percent-encoding is not based on UTF-8.</t>
1689
1690<t>For most URI schemes, there is no need to upgrade their scheme
1691definition in order for them to work with IRIs.  The main case where
1692upgrading makes sense is when a scheme definition, or a particular
1693component of a scheme, is strictly limited to the use of US-ASCII
1694characters with no provision to include non-ASCII characters/octets
1695via percent-encoding, or if a scheme definition currently uses highly
1696scheme-specific provisions for the encoding of non-ASCII characters.
1697An example of this is the mailto: scheme <xref target="RFC2368"/>.</t>
1698
1699<t>This specification updates the IANA registry of URI schemes to note
1700their applicability to IRIs, see <xref target="iana"/>.  All IRIs use
1701URI schemes, and all URIs with URI schemes can be used as IRIs, even
1702though in some cases only by using URIs directly as IRIs, without any
1703conversion.</t>
1704
1705<t>Scheme definitions can impose restrictions on the syntax of
1706scheme-specific URIs; i.e., URIs that are admissible under the generic
1707URI syntax <xref target="RFC3986"/> may not be admissible due to
1708narrower syntactic constraints imposed by a URI scheme
1709specification. URI scheme definitions cannot broaden the syntactic
1710restrictions of the generic URI syntax; otherwise, it would be
1711possible to generate URIs that satisfied the scheme-specific syntactic
1712constraints without satisfying the syntactic constraints of the
1713generic URI syntax. However, additional syntactic constraints imposed
1714by URI scheme specifications are applicable to IRIs, as the
1715corresponding URI resulting from the mapping defined in <xref
1716target="mapping"/> MUST be a valid URI under the syntactic
1717restrictions of generic URI syntax and any narrower restrictions
1718imposed by the corresponding URI scheme specification.</t>
1719
1720<t>The requirement for the use of UTF-8 generally applies to all parts
1721of a URI.  However, it is possible that the capability of IRIs to
1722represent a wide range of characters directly is used just in some
1723parts of the IRI (or IRI reference). The other parts of the IRI may
1724only contain US-ASCII characters, or they may not be based on
1725UTF-8. They may be based on another character encoding, or they may
1726directly encode raw binary data (see also <xref
1727target="RFC2397"/>). </t>
1728
1729<t>For example, it is possible to have a URI reference
1730of<vspace/>"http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9",
1731where the document name is encoded in iso-8859-1 based on server
1732settings, but where the fragment identifier is encoded in UTF-8 according
1733to <xref target="XPointer"/>. The IRI corresponding to the above
1734URI would be (in XML notation)<vspace/>"http://www.example.org/r%E9sum%E9.xml#r&amp;#xE9;sum&amp;#xE9;".</t>
1735
1736<t>Similar considerations apply to query parts. The functionality
1737of IRIs (namely, to be able to include non-ASCII characters) can
1738only be used if the query part is encoded in UTF-8.</t>
1739
1740</section> <!-- utf8 -->
1741
1742<section title="Relative IRI References">
1743<t>Processing of relative IRI references against a base is handled
1744straightforwardly; the algorithms of <xref target="RFC3986"/> can
1745be applied directly, treating the characters additionally allowed
1746in IRI references in the same way that unreserved characters are in URI
1747references.</t>
1748
1749</section> <!-- relative -->
1750</section> <!-- IRIuse -->
1751
1752<section title="Liberal Handling of Otherwise Invalid IRIs" anchor="LEIRIHREF">
1753
1754<t>(EDITOR NOTE: This Section may move to an appendix.)
1755 
1756Some technical specifications and widely-deployed software have
1757allowed additional variations and extensions of IRIs to be used in
1758syntactic components. This section describes two widely-used
1759preprocessing agreements. Other technical specifications may wish to
1760reference a syntactic component which is "a valid IRI or a string that
1761will map to a valid IRI after this preprocessing algorithm". These two
1762variants are known as <xref target="LEIRI">Legacy Extended IRI or
1763LEIRI</xref>, and <xref target="HTML5">Web Address</xref>.
1764</t>
1765
1766<t>Future technical specifications SHOULD NOT allow conforming
1767producers to produce, or conforming content to contain, such forms,
1768as they are not interoperable with other IRI consuming software.</t>
1769
1770<section title="LEIRI Processing"  anchor="LEIRIspec">
1771  <t>This section defines Legacy Extended IRIs (LEIRIs).
1772    The syntax of Legacy Extended IRIs is the same as that for &lt;IRI-reference>,
1773    except that the ucschar production is replaced by the leiri-ucschar production:</t>
1774<figure>
1775
1776<artwork>
1777  leiri-ucschar  = " " / "&lt;" / "&gt;" / '"' / "{" / "}" / "|"
1778                   / "\" / "^" / "`" / %x0-1F / %x7F-D7FF
1779                   / %xE000-FFFD / %x10000-10FFFF
1780</artwork>
1781
1782<postamble>
1783  Among other extensions, processors based on this specification also
1784  did not enforce the restriction on bidirectional formatting
1785  characters in <xref target="visual"></xref>, and the iprivate
1786  production becomes redundant.</postamble>
1787</figure>
1788
1789<t>To convert a string allowed as a LEIRI to an IRI, each character
1790allowed in leiri-ucschar but not in ucschar must be percent-encoded
1791using <xref target="compmapping"/>.</t>
1792</section> <!-- leiriproc -->
1793
1794<section title="Web Address Processing" anchor="webaddress">
1795
1796<t>Many popular web browsers have taken the approach of being quite
1797liberal in what is accepted as a "URL" or its relative
1798forms. This section describes their behavior in terms of a preprocessor
1799which maps strings into the IRI space for subsequent parsing and
1800interpretation as an IRI.</t>
1801
1802<t>In some situations, it might be appropriate to describe the syntax
1803that a liberal consumer implementation might accept as a "Web
1804Address" or "Hypertext Reference" or "HREF". However,
1805technical specifications SHOULD restrict the syntactic form allowed by compliant producers
1806to the IRI or IRI reference syntax defined in this document
1807even if they want to mandate this processing.</t>
1808
1809<t>
1810Summary:
1811<list style="symbols">
1812   <t>Leading and trailing whitespace is removed.</t>
1813   <t>Some additional characters are removed.</t>
1814   <t>Some additional characters are allowed and escaped (as with LEIRI).</t>
1815   <t>If interpreting an IRI as a URI, the pct-encoding of the query
1816   component of the parsed URI component depends on operational
1817   context.</t>
1818</list>
1819</t>
1820
1821<t>Each string provided may have an associated charset (called
1822the HRef-charset here); this defaults to UTF-8.
1823For web browsers interpreting HTML, the HRef-charset
1824of a string is determined as follows:
1825
1826<list style="hanging">
1827<t hangText="If the string came from a script (e.g. as an argument to
1828 a method)">The HRef-charset is the script's charset.</t>
1829
1830<t hangText="If the string came from a DOM node (e.g. from an
1831  element)">The node has a Document, and the HRef-charset is the
1832  Document's character encoding.</t>
1833
1834<t hangText="If the string had a HRef-charset defined when the string was
1835created or defined">The HRef-charset is as defined.</t>
1836
1837</list></t>
1838
1839<t>If the resulting HRef-charset is a Unicode-based character encoding
1840(e.g., UTF-16), then use UTF-8 instead.</t>
1841
1842
1843<figure>
1844<preamble>The syntax for Web Addresses is obtained by replacing the 'ucschar',
1845  pct-form, and path-sep rules with the href-ucschar, href-pct-form, and href-path-sep
1846  rules below. In addition, some characters are stripped.</preamble>
1847
1848<artwork type='abnf'>
1849  href-ucschar  = " " / "&lt;" / "&gt;" / DQUOTE / "{" / "}" / "|"
1850                   / "\" / "^" / "`" / %x0-1F / %x7F-D7FF
1851                   / %xE000-FFFD / %x10000-10FFFF
1852  href-pct-form = pct-encoded / "%"
1853  href-path-sep = "/" / "\"
1854  href-strip    = &lt;to be done&gt;
1855</artwork>
1856
1857<postamble>
1858(NOTE: NEED TO FIX THESE SETS TO MATCH HTML5; NOT SURE ABOUT NEXT SENTENCE)
1859browsers did not enforce the restriction on bidirectional formatting
1860  characters in <xref target="visual"></xref>, and the iprivate
1861  production becomes redundant.</postamble>
1862</figure>
1863
1864<t>'Web Address processing' requires the following additional
1865preprocessing steps:
1866
1867<list style="numbers">
1868
1869<t>Leading and trailing instances of space (U+0020),
1870CR (U+000D), LF (U+000A), and TAB (U+0009) characters are removed.</t>
1871
1872<t>Strip all characters in href-strip.</t>
1873  <t>Percent-encode all characters in href-ucschar not in ucschar.</t>
1874  <t>Replace occurrences of "%" not followed by two hexadecimal digits by "%25".</t>
1875  <t>Convert backslashes ('\') matching href-path-sep to forward slashes ('/').</t>
1876</list></t>
1877</section> <!-- webaddress -->
1878
1879<section title="Characters Not Allowed in IRIs" anchor="notAllowed">
1880
1881<t>This section provides a list of the groups of characters and code
1882points that are allowed by LEIRI or HREF but are not allowed in IRIs or are
1883allowed in IRIs only in the query part. For each group of characters,
1884advice on the usage of these characters is also given, concentrating
1885on the reasons for why they are excluded from IRI use.</t>
1886
1887<t>
1888
1889<list><t>Space (U+0020): Some formats and applications use space as a
1890delimiter, e.g. for items in a list. Appendix C of <xref
1891target="RFC3986"></xref> also mentions that white space may have to be
1892added when displaying or printing long URIs; the same applies to long
1893IRIs. This means that spaces can disappear, or can cause what is
1894intended as a single IRI or IRI reference to be treated as two or more
1895separate IRIs.</t>
1896
1897<t>Delimiters "&lt;" (U+003C), "&gt;" (U+003E), and '"' (U+0022):
1898Appendix C of <xref target="RFC3986"></xref> suggests the use of
1899double-quotes ("http://example.com/") and angle brackets
1900(&lt;http://example.com/&gt;) as delimiters for URIs in plain
1901text. These conventions are often used, and also apply to IRIs.  Using
1902these characters in strings intended to be IRIs would result in the
1903IRIs being cut off at the wrong place.</t>
1904
1905<t>Unwise characters "\" (U+005C), "^" (U+005E), "`"
1906(U+0060), "{" (U+007B), "|" (U+007C), and "}" (U+007D): These
1907characters originally have been excluded from URIs because the
1908respective codepoints are assigned to different graphic characters in
1909some 7-bit or 8-bit encoding. Despite the move to Unicode, some of
1910these characters are still occasionally displayed differently on some
1911systems, e.g. U+005C may appear as a Japanese Yen symbol on some
1912systems. Also, the fact that these characters are not used in URIs or
1913IRIs has encouraged their use outside URIs or IRIs in contexts that
1914may include URIs or IRIs. If a string with such a character were used
1915as an IRI in such a context, it would likely be interpreted
1916piecemeal.</t>
1917
1918<t>The controls (C0 controls, DEL, and C1 controls, #x0 - #x1F #x7F -
1919#x9F): There is generally no way to transmit these characters reliably
1920as text outside of a charset encoding.  Even when in encoded form,
1921many software components silently filter out some of these characters,
1922or may stop processing altogether when encountering some of
1923them. These characters may affect text display in subtle, unnoticeable
1924ways or in drastic, global, and irreversible ways depending on the
1925hardware and software involved. The use of some of these characters
1926would allow malicious users to manipulate the display of an IRI and
1927its context in many situations.</t>
1928
1929<t>Bidi formatting characters (U+200E, U+200F, U+202A-202E): These
1930characters affect the display ordering of characters. If IRIs were
1931allowed to contain these characters and the resulting visual display
1932transcribed, they could not be converted back to electronic form
1933(logical order) unambiguously. These characters, if allowed in IRIs,
1934might allow malicious users to manipulate the display of an IRI and its
1935context.</t>
1936
1937<t>Specials (U+FFF0-FFFD): These code points provide functionality
1938beyond that useful in an IRI, for example byte order identification,
1939annotation, and replacements for unknown characters and objects. Their
1940use and interpretation in an IRI would serve no purpose and might lead
1941to confusing display variations.</t>
1942
1943<t>Private use code points (U+E000-F8FF, U+F0000-FFFFD,
1944U+100000-10FFFD): Display and interpretation of these code points is
1945by definition undefined without private agreement. Therefore, these
1946code points are not suited for use on the Internet. They are not
1947interoperable and may have unpredictable effects.</t>
1948
1949<t>Tags (U+E0000-E0FFF): These characters provide a way to language
1950tag in Unicode plain text. They are not appropriate for IRIs because
1951language information in identifiers cannot reliably be input,
1952transmitted (e.g. on a visual medium such as paper), or
1953recognized.</t>
1954
1955<t>Non-characters (U+FDD0-FDEF, U+1FFFE-1FFFF, U+2FFFE-2FFFF,
1956U+3FFFE-3FFFF, U+4FFFE-4FFFF, U+5FFFE-5FFFF, U+6FFFE-6FFFF,
1957U+7FFFE-7FFFF, U+8FFFE-8FFFF, U+9FFFE-9FFFF, U+AFFFE-AFFFF,
1958U+BFFFE-BFFFF, U+CFFFE-CFFFF, U+DFFFE-DFFFF, U+EFFFE-EFFFF,
1959U+FFFFE-FFFFF, U+10FFFE-10FFFF): These code points are defined as
1960non-characters. Applications may use some of them internally, but are
1961not prepared to interchange them.</t>
1962
1963</list></t>
1964
1965<t>LEIRI preprocessing disallowed some code points and
1966code units:
1967
1968<list><t>Surrogate code units (D800-DFFF): These do not represent
1969Unicode codepoints.</t></list></t>
1970</section> <!-- notallowed -->
1971</section> <!-- lieirihref -->
1972 
1973<section title="URI/IRI Processing Guidelines (Informative)" anchor="guidelines">
1974
1975<t>This informative section provides guidelines for supporting IRIs in
1976the same software components and operations that currently process
1977URIs: Software interfaces that handle URIs, software that allows users
1978to enter URIs, software that creates or generates URIs, software that
1979displays URIs, formats and protocols that transport URIs, and software
1980that interprets URIs. These may all require modification before
1981functioning properly with IRIs. The considerations in this section
1982also apply to URI references and IRI references.</t>
1983
1984<section title="URI/IRI Software Interfaces">
1985<t>Software interfaces that handle URIs, such as URI-handling APIs and
1986protocols transferring URIs, need interfaces and protocol elements
1987that are designed to carry IRIs.</t>
1988
1989<t>In case the current handling in an API or protocol is based on
1990US-ASCII, UTF-8 is recommended as the character encoding for IRIs, as
1991it is compatible with US-ASCII, is in accordance with the
1992recommendations of <xref target="RFC2277"/>, and makes converting to
1993URIs easy. In any case, the API or protocol definition must clearly
1994define the character encoding to be used.</t>
1995
1996<t>The transfer from URI-only to IRI-capable components requires no
1997mapping, although the conversion described in <xref
1998target="URItoIRI"/> above may be performed. It is preferable not to
1999perform this inverse conversion unless it is certain this can be done
2000correctly.</t>
2001</section>
2002
2003<section title="URI/IRI Entry">
2004
2005<t>Some components allow users to enter URIs into the system
2006by typing or dictation, for example. This software must be updated to allow
2007for IRI entry.</t>
2008
2009<t>A person viewing a visual representation of an IRI (as a sequence
2010of glyphs, in some order, in some visual display) or hearing an IRI
2011will use an entry method for characters in the user's language to
2012input the IRI. Depending on the script and the input method used, this
2013may be a more or less complicated process.</t>
2014
2015<t>The process of IRI entry must ensure, as much as possible, that the
2016restrictions defined in <xref target="abnf"/> are met. This may be
2017done by choosing appropriate input methods or variants/settings
2018thereof, by appropriately converting the characters being input, by
2019eliminating characters that cannot be converted, and/or by issuing a
2020warning or error message to the user.</t>
2021
2022<t>As an example of variant settings, input method editors for East
2023Asian Languages usually allow the input of Latin letters and related
2024characters in full-width or half-width versions. For IRI input, the
2025input method editor should be set so that it produces half-width Latin
2026letters and punctuation and full-width Katakana.</t>
2027
2028<t>An input field primarily or solely used for the input of URIs/IRIs
2029might allow the user to view an IRI as it is mapped to a URI.  Places
2030where the input of IRIs is frequent may provide the possibility for
2031viewing an IRI as mapped to a URI. This will help users when some of
2032the software they use does not yet accept IRIs.</t>
2033
2034<t>An IRI input component interfacing to components that handle URIs,
2035but not IRIs, must map the IRI to a URI before passing it to these
2036components.</t>
2037
2038<t>For the input of IRIs with right-to-left characters, please see
2039<xref target="bidiInput"></xref>.</t>
2040</section>
2041
2042<section title="URI/IRI Transfer between Applications">
2043
2044<t>Many applications (for example, mail user agents) try to detect
2045URIs appearing in plain text. For this, they use some heuristics based
2046on URI syntax. They then allow the user to click on such URIs and
2047retrieve the corresponding resource in an appropriate (usually
2048scheme-dependent) application.</t>
2049
2050<t>Such applications would need to be upgraded, in order to use the
2051IRI syntax as a base for heuristics. In particular, a non-ASCII
2052character should not be taken as the indication of the end of an IRI.
2053Such applications also would need to make sure that they correctly
2054convert the detected IRI from the character encoding of the document
2055or application where the IRI appears, to the character encoding used
2056by the system-wide IRI invocation mechanism, or to a URI (according to
2057<xref target="mapping"/>) if the system-wide invocation mechanism only
2058accepts URIs.</t>
2059
2060<t>The clipboard is another frequently used way to transfer URIs and
2061IRIs from one application to another. On most platforms, the clipboard
2062is able to store and transfer text in many languages and scripts.
2063Correctly used, the clipboard transfers characters, not octets, which
2064will do the right thing with IRIs.</t>
2065</section>
2066
2067<section title="URI/IRI Generation">
2068
2069<t>Systems that offer resources through the Internet, where those
2070resources have logical names, sometimes automatically generate URIs
2071for the resources they offer. For example, some HTTP servers can
2072generate a directory listing for a file directory and then respond to
2073the generated URIs with the files.</t>
2074
2075<t>Many legacy character encodings are in use in various file systems.
2076Many currently deployed systems do not transform the local character
2077representation of the underlying system before generating URIs.</t>
2078
2079<t>For maximum interoperability, systems that generate resource
2080identifiers should make the appropriate transformations. For example,
2081if a file system contains a file named
2082"r&amp;#xE9;sum&amp;#xE9;.html", a server should expose this as
2083"r%C3%A9sum%C3%A9.html" in a URI, which allows use of
2084"r&amp;#xE9;sum&amp;#xE9;.html" in an IRI, even if locally the file
2085name is kept in a character encoding other than UTF-8.
2086</t>
2087
2088<t>This recommendation particularly applies to HTTP servers. For FTP
2089servers, similar considerations apply; see <xref target="RFC2640"/>.</t>
2090</section>
2091
2092<section title="URI/IRI Selection" anchor="selection">
2093<t>In some cases, resource owners and publishers have control over the
2094IRIs used to identify their resources. This control is mostly
2095executed by controlling the resource names, such as file names,
2096directly.</t>
2097
2098<t>In these cases, it is recommended to avoid choosing IRIs that are
2099easily confused. For example, for US-ASCII, the lower-case ell ("l") is
2100easily confused with the digit one ("1"), and the upper-case oh ("O") is
2101easily confused with the digit zero ("0"). Publishers should avoid
2102confusing users with "br0ken" or "1ame" identifiers.</t>
2103
2104<t>Outside the US-ASCII repertoire, there are many more opportunities for
2105confusion; a complete set of guidelines is too lengthy to include
2106here. As long as names are limited to characters from a single script,
2107native writers of a given script or language will know best when
2108ambiguities can appear, and how they can be avoided. What may look
2109ambiguous to a stranger may be completely obvious to the average
2110native user. On the other hand, in some cases, the UCS contains
2111variants for compatibility reasons; for example, for typographic purposes.
2112These should be avoided wherever possible. Although there may be exceptions,
2113newly created resource names should generally be in NFKC
2114<xref target="UTR15"></xref> (which means that they are also in NFC).</t>
2115
2116<t>As an example, the UCS contains the "fi" ligature at U+FB01
2117for compatibility reasons.
2118Wherever possible, IRIs should use the two letters "f" and "i" rather
2119than the "fi" ligature. An example where the latter may be used is
2120in the query part of an IRI for an explicit search for a word
2121written with the "fi" ligature.</t>
2122
2123<t>In certain cases, there is a chance that characters from different
2124scripts look the same. The best known example is the similarity of the
2125Latin "A", the Greek "Alpha", and the Cyrillic "A". To avoid such
2126cases, IRIs should only be created where all the characters in a
2127single component are used together in a given language. This usually
2128means that all of these characters will be from the same script, but
2129there are languages that mix characters from different scripts (such
2130as Japanese).  This is similar to the heuristics used to distinguish
2131between letters and numbers in the examples above. Also, for Latin,
2132Greek, and Cyrillic, using lowercase letters results in fewer
2133ambiguities than using uppercase letters would.</t>
2134</section>
2135
2136<section title="Display of URIs/IRIs" anchor="display">
2137<t>
2138In situations where the rendering software is not expected to display
2139non-ASCII parts of the IRI correctly using the available layout and font
2140resources, these parts should be percent-encoded before being displayed.</t>
2141
2142<t>For display of Bidi IRIs, please see <xref target="visual"/>.</t>
2143</section>
2144
2145<section title="Interpretation of URIs and IRIs">
2146<t>Software that interprets IRIs as the names of local resources should
2147accept IRIs in multiple forms and convert and match them with the
2148appropriate local resource names.</t>
2149
2150<t>First, multiple representations include both IRIs in the native
2151character encoding of the protocol and also their URI counterparts.</t>
2152
2153<t>Second, it may include URIs constructed based on character
2154encodings other than UTF-8. These URIs may be produced by user agents that do
2155not conform to this specification and that use legacy character encodings to
2156convert non-ASCII characters to URIs. Whether this is necessary, and what
2157character encodings to cover, depends on a number of factors, such as
2158the legacy character encodings used locally and the distribution of
2159various versions of user agents. For example, software for Japanese
2160may accept URIs in Shift_JIS and/or EUC-JP in addition to UTF-8.</t>
2161
2162<t>Third, it may include additional mappings to be more user-friendly
2163and robust against transmission errors. These would be similar to how
2164some servers currently treat URIs as case insensitive or perform
2165additional matching to account for spelling errors. For characters
2166beyond the US-ASCII repertoire, this may, for example, include
2167ignoring the accents on received IRIs or resource names. Please note
2168that such mappings, including case mappings, are language
2169dependent.</t>
2170
2171<t>It can be difficult to identify a resource unambiguously if too
2172many mappings are taken into consideration. However, percent-encoded
2173and not percent-encoded parts of IRIs can always be clearly distinguished.
2174Also, the regularity of UTF-8 (see <xref target="Duerst97"/>) makes the
2175potential for collisions lower than it may seem at first.</t>
2176</section>
2177
2178<section title="Upgrading Strategy">
2179<t>Where this recommendation places further constraints on software
2180for which many instances are already deployed, it is important to
2181introduce upgrades carefully and to be aware of the various
2182interdependencies.</t>
2183
2184<t>If IRIs cannot be interpreted correctly, they should not be created,
2185generated, or transported. This suggests that upgrading URI interpreting
2186software to accept IRIs should have highest priority.</t>
2187
2188<t>On the other hand, a single IRI is interpreted only by a single or
2189very few interpreters that are known in advance, although it may be
2190entered and transported very widely.</t>
2191
2192<t>Therefore, IRIs benefit most from a broad upgrade of software to be
2193able to enter and transport IRIs. However, before an
2194individual IRI is published, care should be taken to upgrade the corresponding
2195interpreting software in order to cover the forms expected to be
2196received by various versions of entry and transport software.</t>
2197
2198<t>The upgrade of generating software to generate IRIs instead of using a
2199local character encoding should happen only after the service is upgraded
2200to accept IRIs. Similarly, IRIs should only be generated when the service
2201accepts IRIs and the intervening infrastructure and protocol are known
2202to transport them safely.</t>
2203
2204<t>Software converting from URIs to IRIs for display should be upgraded
2205only after upgraded entry software has been widely deployed to the
2206population that will see the displayed result.</t>
2207
2208
2209<t>Where there is a free choice of character encodings, it is often
2210possible to reduce the effort and dependencies for upgrading to IRIs
2211by using UTF-8 rather than another encoding. For example, when a new
2212file-based Web server is set up, using UTF-8 as the character encoding
2213for file names will make the transition to IRIs easier. Likewise, when
2214a new Web form is set up using UTF-8 as the character encoding of the
2215form page, the returned query URIs will use UTF-8 as the character
2216encoding (unless the user, for whatever reason, changes the character
2217encoding) and will therefore be compatible with IRIs.</t>
2218
2219
2220<t>These recommendations, when taken together, will allow for the
2221extension from URIs to IRIs in order to handle characters other than
2222US-ASCII while minimizing interoperability problems. For
2223considerations regarding the upgrade of URI scheme definitions, see
2224<xref target="UTF8use"/>.</t>
2225
2226</section>
2227</section> <!-- guidelines -->
2228
2229<section title="IANA Considerations" anchor="iana">
2230
2231<t>RFC Editor and IANA note: Please replace RFC XXXX with the
2232number of this document when it issues as an RFC. </t>
2233
2234<t>IANA maintains a registry of "URI schemes". A "URI scheme" also
2235serves as an "IRI scheme". </t>
2236
2237<t>To clarify that the URI scheme registration process also applies to
2238IRIs, change the description of the "URI schemes" registry
2239header to say "[RFC4395] defines an IANA-maintained registry of URI
2240Schemes. These registries include the Permanent and Provisional URI
2241Schemes.  RFC XXXX updates this registry to designate that schemes may
2242also indicate their usability as IRI schemes."</t>
2243
2244<t> Update "per RFC 4395" to "per RFC 4395 and RFC XXXX".
2245</t>
2246
2247</section> <!-- IANA -->
2248   
2249<section title="Security Considerations" anchor="security">
2250<t>The security considerations discussed in <xref target="RFC3986"/>
2251also apply to IRIs. In addition, the following issues require
2252particular care for IRIs.</t>
2253<t>Incorrect encoding or decoding can lead to security problems.
2254In particular, some UTF-8 decoders do not check against overlong
2255byte sequences. As an example, a "/" is encoded with the byte 0x2F
2256both in UTF-8 and in US-ASCII, but some UTF-8 decoders also wrongly
2257interpret the sequence 0xC0 0xAF as a "/". A sequence such as "%C0%AF.."
2258may pass some security tests and then be interpreted
2259as "/.." in a path if UTF-8 decoders are fault-tolerant, if conversion
2260and checking are not done in the right order, and/or if reserved
2261characters and unreserved characters are not clearly distinguished.</t>
2262
2263<t>There are various ways in which "spoofing" can occur with IRIs.
2264"Spoofing" means that somebody may add a resource name that looks the
2265same or similar to the user, but that points to a different resource.
2266The added resource may pretend to be the real resource by looking
2267very similar but may contain all kinds of changes that may be
2268difficult to spot and that can cause all kinds of problems.
2269Most spoofing possibilities for IRIs are extensions of those for URIs.</t>
2270
2271<t>Spoofing can occur for various reasons. First, a user's normalization expectations or actual normalization
2272when entering an IRI or  transcoding an IRI from a legacy character
2273encoding do not match the normalization used on the
2274server side. Conceptually, this is no different from the problems
2275surrounding the use of case-insensitive web servers. For example,
2276a popular web page with a mixed-case name ("http://big.example.com/PopularPage.html")
2277might be "spoofed" by someone who is able to create "http://big.example.com/popularpage.html".
2278However, the use of unnormalized character sequences, and of additional
2279mappings for user convenience, may increase the chance for spoofing.
2280Protocols and servers that allow the creation of resources with
2281names that are not normalized are particularly vulnerable to such
2282attacks. This is an inherent
2283security problem of the relevant protocol, server, or resource
2284and is not specific to IRIs, but it is mentioned here for completeness.</t>
2285
2286<t>Spoofing can occur in various IRI components, such as the
2287domain name part or a path part. For considerations specific
2288to the domain name part, see <xref target="RFC3491"/>.
2289For the path part, administrators of sites that allow independent
2290users to create resources in the same sub area may have to be careful
2291to check for spoofing.</t>
2292
2293<t>Spoofing can occur because in the UCS many characters look very similar. Details are discussed in <xref target="selection"/>.
2294Again, this is very similar to spoofing possibilities on US-ASCII,
2295e.g., using "br0ken" or "1ame" URIs.</t>
2296
2297<t>Spoofing can occur when URIs with percent-encodings based on various
2298character encodings are accepted to deal with older user agents. In some
2299cases, particularly for Latin-based resource names, this is usually easy to
2300detect because UTF-8-encoded names, when interpreted and viewed as
2301legacy character encodings, produce mostly garbage.</t><t>When
2302concurrently used character encodings have a similar structure but there
2303are no characters that have exactly the same encoding, detection is more
2304difficult.</t>
2305
2306<t>Spoofing can occur with bidirectional IRIs, if the restrictions
2307in <xref target="bidi-structure"/> are not followed. The same visual
2308representation may be interpreted as different logical representations,
2309and vice versa. It is also very important that a correct Unicode bidirectional
2310implementation be used.</t><t>The use of Legacy Extended IRIs introduces additional security issues.</t>
2311</section><!-- security -->
2312
2313<section title="Acknowledgements">
2314<t>This document was derived from <xref target="RFC3987"/>; the acknowledgments from
2315that specification still apply.</t>
2316<t>We would like to thank Ian Hickson, Michael Sperberg-McQueen,
2317  and Dan Connolly for their work on HyperText References, and Norman Walsh, Richard Tobin,
2318  Henry S. Thompson, John Cowan, Paul Grosso, and the XML Core Working Group of the W3C for their work on LEIRIs.</t>
2319<t>In addition, this document was influenced by contributions from (in no particular order) Chris
2320  Lilley, Bjoern Hoehrmann,
2321Felix Sasaki, Jeremy Carroll, Frank Ellermann, Michael Everson, Cary Karp, Matitiahu Allouche,
2322Richard Ishida, Addison Phillips, Jonathan Rosenne, Najib Tounsi, Debbie Garside, Mark Davis, Sarmad
2323Hussain, Ted Hardie, Konrad Lanz, Thomas Roessler, Lisa Dusseault, Julian Reschke, Giovanni
2324Campagna, Anne van Kesteren, Mark Nottingham, Erik van der Poel, Marcin Hanclik, Marcos Caceres, Roy
2325Fielding, Greg Wilkins, Pieter Hintjens, Daniel R. Tobias, Marko Martin, Maciej Stachowiak, Wil
2326Tan, Yui Naruse, Michael A. Puls II, Dave Thaler, Tom Perch, John Klensin, Shawn Steele, Peter
2327Saint-Andre, Geoffrey Sneddon, Chris Weber, Alex Melnikov, Slim Amamou, SM, Tim Berners-Lee, Yaron
2328Goland, Sam Ruby, Adam Barth, Abdulrahman I. ALGhadir, Aharon Lanin, Thomas Milo, Murray Sargent,
2329Marc Blanchet, and Mykyta Yevstifeyev.</t>
2330</section>
2331
2332<section title="Main Changes Since RFC 3987">
2333  <t>This section describes the main changes since <xref target="RFC3987"></xref>.</t>
2334  <section title="Major restructuring of IRI processing model" anchor="forkChanges">
2335    <t>Major restructuring of IRI processing model to make scheme-specific translation
2336      necessary to handle IDNA requirements and for consistency with web implementations. </t>
2337    <t>Starting with IRI, you want one of:
2338      <list style="hanging">
2339        <t hangText="a"> IRI components (IRI parsed into UTF8 pieces)</t>
2340        <t hangText="b"> URI components (URI parsed into ASCII pieces, encoded correctly) </t>
2341        <t hangText="c"> whole URI  (for passing on to some other system that wants whole URIs) </t>
2342      </list></t>
2343   
2344    <section title="OLD WAY">
2345      <t><list style="numbers">
2346       
2347        <t>Pct-encoding on the whole thing to a URI.
2348          (c1) If you want a (maybe broken) whole URI, you might
2349          stop here.</t>
2350       
2351        <t>Parsing the URI into URI components.
2352          (b1) If you want (maybe broken) URI components, stop here.</t>
2353       
2354        <t> Decode the components (undoing the pct-encoding).
2355          (a) if you want IRI components, stop here.</t>
2356       
2357        <t> reencode:  Either using a different encoding for some components
2358          (for domain names, and query components in web pages, which
2359          depends on the component, scheme and context), and otherwise
2360          using pct-encoding.
2361          (b2) if you want (good) URI components, stop here.</t>
2362       
2363        <t> reassemble the reencoded components.
2364          (c2) if you want a (*good*) whole URI stop here.</t>
2365      </list>
2366       
2367      </t>
2368     
2369    </section>
2370   
2371    <section title="NEW WAY">
2372      <t>
2373        <list style="numbers">
2374         
2375          <t> Parse the IRI into IRI components using the generic syntax.
2376            (a) if you want IRI components, stop here.</t>
2377         
2378          <t> Encode each component, using pct-encoding, IDN encoding, or
2379            special query part encoding depending on the component
2380            scheme or context. (b) If you want URI components, stop here.</t>
2381          <t> reassemble a whole URI from URI components.
2382            (c) if you want a whole URI stop here.</t>
2383        </list></t>
2384    </section>
2385    <section title="Extension of Syntax">
2386      <t>Added the tag range (U+E0000-E0FFF) to the iprivate production.
2387        Some IRIs generated with the new syntax may fail to pass very strict checks
2388        relying on the old syntax. But characters in this range should be extremely infrequent
2389        anyway.</t>
2390    </section>
2391    <section title="More to be added"><t>TODO: There are more main changes that need to be
2392      documented in this section.</t></section>
2393</section>
2394
2395<section title="Change Log">
2396
2397<t>Note to RFC Editor: Please completely remove this section before publication.</t>
2398
2399<section title='Changes after draft-ietf-iri-3987bis-01'>
2400    <t>Changes from draft-ietf-iri-3987bis-01 onwards are available as changesets
2401      in the IETF tools subversion repository at
2402      http://trac.tools.ietf.org/wg/iri/trac/log/draft-ietf-iri-3987bis/draft-ietf-iri-3987bis.xml.</t>
2403</section>
2404 
2405<section title='Changes from draft-duerst-iri-bis-07 to draft-ietf-iri-3987bis-00'>
2406     <t>Changed draft name, date, last paragraph of abstract, and titles in change log, and added this section
2407     in moving from draft-duerst-iri-bis-07 (personal submission) to draft-ietf-iri-3987bis-00 (WG document).</t>
2408</section>
2409
2410<section title="Changes from -06 to -07 of draft-duerst-iri-bis">
2411  <t>Major restructuring of the processing model, see <xref target="forkChanges"></xref>.</t>
2412</section>
2413</section>
2414
2415<section title='Changes from -00 to -01'><t><list style="symbols">
2416  <t>Removed 'mailto:' before mail addresses of authors.</t>
2417  <t>Added "&lt;to be done&gt;" as right side of 'href-strip' rule. Fixed '|' to '/' for
2418    alternatives.</t>
2419</list></t>
2420</section>
2421
2422<section title="Changes from -05 to -06 of draft-duerst-iri-bis-00"><t><list style="symbols">
2423<t>Add HyperText Reference, change abstract, acks and references for it</t>
2424<t>Add Masinter back as another editor.</t>
2425<t>Masinter integrates HRef material from HTML5 spec.</t>
2426<t>Rewrite introduction sections to modernize.</t>
2427</list></t>
2428</section>
2429
2430<section title="Changes from -04 to -05 of draft-duerst-iri-bis">
2431  <t><list style="symbols">
2432    <t>Updated references.</t>
2433    <t>Changed IPR text to pre5378Trust200902.</t></list></t>
2434</section>
2435
2436<section title="Changes from -03 to -04 of draft-duerst-iri-bis">
2437  <t><list style="symbols">
2438    <t>Added explicit abbreviation for LEIRIs.</t>
2439    <t>Mentioned LEIRI references.</t>
2440    <t>Completed text in LEIRI section about tag characters and about specials.</t></list></t>
2441</section>
2442
2443<section title="Changes from -02 to -03 of draft-duerst-iri-bis">
2444  <t><list style="symbols">
2445    <t>Updated some references.</t>
2446    <t>Updated Michel Suignard's coordinates.</t></list></t>
2447</section>
2448
2449<section title="Changes from -01 to -02 of draft-duerst-iri-bis">
2450  <t><list style="symbols">
2451    <t>Added tag range to iprivate (issue private-include-tags-115).</t>
2452    <t>Added Specials (U+FFF0-FFFD) to Legacy Extended IRIs.</t></list></t>
2453</section>
2454<section title="Changes from -00 to -01 of draft-duerst-iri-bis">
2455  <t><list style="symbols">
2456    <t>Changed from "IRIs with Spaces/Controls" to "Legacy Extended IRI"
2457      based on input from the W3C XML Core WG.
2458      Moved the relevant subsections to the back and promoted them to a section.</t>
2459    <t>Added some text re. Legacy Extended IRIs to the security section.</t>
2460    <t>Added an IANA Considerations Section.</t>
2461    <t>Added this Change Log Section.</t>
2462    <t>Added a section about "IRIs with Spaces/Controls" (converting from a Note in RFC 3987).</t></list></t>
2463</section>
2464<section title="Changes from RFC 3987 to -00 of draft-duerst-iri-bis">
2465  <t><list>
2466    <t>Fixed errata (see http://www.rfc-editor.org/cgi-bin/errataSearch.pl?rfc=3987).</t></list></t>
2467</section>
2468</section>
2469</middle>
2470
2471<back>
2472<references title="Normative References">
2473
2474<reference anchor="ASCII">
2475<front>
2476<title>Coded Character Set -- 7-bit American Standard Code for Information
2477Interchange</title>
2478<author>
2479<organization>American National Standards Institute</organization>
2480</author>
2481<date year="1986"/>
2482</front>
2483<seriesInfo name="ANSI" value="X3.4"/>
2484</reference>
2485
2486<reference anchor="ISO10646">
2487<front>
2488<title>ISO/IEC 10646:2003: Information Technology -
2489Universal Multiple-Octet Coded Character Set (UCS)</title>
2490<author>
2491<organization>International Organization for Standardization</organization>
2492</author>
2493<date month="December" year="2003"/>
2494</front>
2495<seriesInfo name="ISO" value="Standard 10646"/>
2496</reference>
2497
2498&rfc2119;
2499&rfc3490;
2500&rfc3491;
2501&rfc3629;
2502&rfc3986;
2503
2504<reference anchor="STD68">
2505<front>
2506<title abbrev="ABNF">Augmented BNF for Syntax Specifications: ABNF</title>
2507<author initials="D." surname="Crocker" fullname="Dave Crocker"><organization/></author>
2508<author initials="P." surname="Overell" fullname="Paul Overell"><organization/></author>
2509<date month="January" year="2008"/></front>
2510<seriesInfo name="STD" value="68"/><seriesInfo name="RFC" value="5234"/>
2511</reference>
2512 
2513&rfc5890;
2514&rfc5891;
2515
2516<reference anchor="UNIV6">
2517<front>
2518<title>The Unicode Standard, Version 6.0.0 (Mountain View, CA, The Unicode Consortium, 2011, ISBN 978-1-936213-01-6)</title>
2519<author><organization>The Unicode Consortium</organization></author>
2520<date year="2010" month="October"/>
2521</front>
2522</reference>
2523
2524<reference anchor="UNI9" target="http://www.unicode.org/reports/tr9/tr9-13.html">
2525<front>
2526<title>The Bidirectional Algorithm</title>
2527<author initials="M." surname="Davis" fullname="Mark Davis"><organization/></author>
2528<date year="2004" month="March"/>
2529</front>
2530<seriesInfo name="Unicode Standard Annex" value="#9"/>
2531</reference>
2532
2533<reference anchor="UTR15" target="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
2534<front>
2535<title>Unicode Normalization Forms</title>
2536<author initials="M." surname="Davis" fullname="Mark Davis"><organization/></author>
2537<author initials="M.J." surname="Duerst" fullname="Martin Duerst"><organization/></author>
2538<date year="2008" month="March"/>
2539</front>
2540<seriesInfo name="Unicode Standard Annex" value="#15"/>
2541</reference>
2542
2543</references>
2544
2545<references title="Informative References">
2546
2547<reference anchor="BidiEx" target="http://www.w3.org/International/iri-edit/BidiExamples">
2548<front>
2549<title>Examples of bidirectional IRIs</title>
2550<author><organization/></author>
2551<date year="" month=""/>
2552</front>
2553</reference>
2554
2555<reference anchor="CharMod" target="http://www.w3.org/TR/charmod-resid">
2556<front>
2557<title>Character Model for the World Wide Web: Resource Identifiers</title>
2558<author initials="M." surname="Duerst" fullname="Martin Duerst"><organization/></author>
2559<author initials="F." surname="Yergeau" fullname="Francois Yergeau"><organization/></author>
2560<author initials="R." surname="Ishida" fullname="Richard Ishida"><organization/></author>
2561<author initials="M." surname="Wolf" fullname="Misha Wolf"><organization/></author>
2562<author initials="T." surname="Texin" fullname="Tex Texin"><organization/></author>
2563<date year="2004" month="November" day="22"/>
2564</front>
2565<seriesInfo name="World Wide Web Consortium" value="Candidate Recommendation"/>
2566</reference>
2567
2568<reference anchor="Duerst97" target="http://www.ifi.unizh.ch/mml/mduerst/papers/PDF/IUC11-UTF-8.pdf">
2569<front>
2570<title>The Properties and Promises of UTF-8</title>
2571<author initials="M.J." surname="Duerst" fullname="Martin Duerst"><organization/></author>
2572<date year="1997" month="September"/>
2573</front>
2574<seriesInfo name="Proc. 11th International Unicode Conference, San Jose" value=""/>
2575</reference>
2576
2577<reference anchor="Gettys" target="http://www.w3.org/DesignIssues/ModelConsequences">
2578<front>
2579<title>URI Model Consequences</title>
2580<author initials="J." surname="Gettys" fullname="Jim Gettys"><organization/></author>
2581<date month="" year=""/>
2582</front>
2583</reference>
2584
2585<reference anchor="HTML4" target="http://www.w3.org/TR/html401/appendix/notes.html#h-B.2">
2586<front>
2587<title>HTML 4.01 Specification</title>
2588<author initials="D." surname="Raggett" fullname="Dave Raggett"><organization/></author>
2589<author initials="A." surname="Le Hors" fullname="Arnaud Le Hors"><organization/></author>
2590<author initials="I." surname="Jacobs" fullname="Ian Jacobs"><organization/></author>
2591<date year="1999" month="December" day="24"/>
2592</front>
2593<seriesInfo name="World Wide Web Consortium" value="Recommendation"/>
2594</reference>
2595
2596<reference anchor="LEIRI" target="http://www.w3.org/TR/leiri/">
2597<front>
2598<title>Legacy extended IRIs for XML resource identification</title>
2599<author initials="H." surname="Thompson" fullname="Henry Thompson"><organization/></author>
2600<author initials="R." surname="Tobin" fullname="Richard Tobin"><organization/></author>
2601<author initials="N." surname="Walsh" fullname="Norman Walsh"><organization/></author>
2602  <date year="2008" month="November" day="3"/>
2603
2604</front>
2605<seriesInfo name="World Wide Web Consortium" value="Note"/>
2606</reference>
2607
2608
2609&rfc2045;
2610&rfc2130;
2611&rfc2141;
2612&rfc2192;
2613&rfc2277;
2614&rfc2368;
2615&rfc2384;
2616&rfc2396;
2617&rfc2397;
2618&rfc2616;
2619&rfc1738;
2620&rfc2640;
2621&rfc3987;
2622<reference anchor='RFC4395bis'>
2623  <front>
2624    <title>Guidelines and Registration Procedures for New URI/IRI Schemes</title>
2625    <author initials='T.' surname='Hansen' fullname="Tony Hansen"><organization/></author>
2626    <author initials='T.' surname='Hardie' fullname="Ted Hardie"><organization/></author>
2627    <author initials='L.' surname='Masinter' fullname="Larry Masinter"><organization/></author>
2628    <date year="2010" month='September' day="30"/>
2629    <workgroup>IRI</workgroup>
2630  </front>
2631  <seriesInfo name="Internet-Draft" value="draft-hansen-iri-4395bis-irireg-00"/>
2632</reference>
2633 
2634 
2635<reference anchor="UNIXML" target="http://www.w3.org/TR/unicode-xml/">
2636<front>
2637<title>Unicode in XML and other Markup Languages</title>
2638<author initials="M.J." surname="Duerst" fullname="Martin Duerst"><organization/></author>
2639<author initials="A." surname="Freytag" fullname="Asmus Freytag"><organization/></author>
2640<date year="2003" month="June" day="18"/>
2641</front>
2642<seriesInfo name="Unicode Technical Report" value="#20"/>
2643<seriesInfo name="World Wide Web Consortium" value="Note"/>
2644</reference>
2645 
2646<reference anchor="UTR36" target="http://unicode.org/reports/tr36/">
2647<front>
2648<title>Unicode Security Considerations</title>
2649<author initials="M." surname="Davis" fullname="Mark Davis"><organization/></author>
2650<author initials="M." surname="Suignard" fullname="Michel Suignard"><organization/></author>
2651<date year="2010" month="August" day="4"/>
2652</front>
2653<seriesInfo name="Unicode Technical Report" value="#36"/>
2654</reference>
2655
2656<reference anchor="XLink" target="http://www.w3.org/TR/xlink/#link-locators">
2657<front>
2658<title>XML Linking Language (XLink) Version 1.0</title>
2659<author initials="S." surname="DeRose" fullname="Steve DeRose"><organization/></author>
2660<author initials="E." surname="Maler" fullname="Eve Maler"><organization/></author>
2661<author initials="D." surname="Orchard" fullname="David Orchard"><organization/></author>
2662<date year="2001" month="June" day="27"/>
2663</front>
2664<seriesInfo name="World Wide Web Consortium" value="Recommendation"/>
2665</reference>
2666
2667<reference anchor="XML1" target="http://www.w3.org/TR/REC-xml">
2668  <front>
2669    <title>Extensible Markup Language (XML) 1.0 (Fourth Edition)</title>
2670    <author initials="T." surname="Bray" fullname="Tim Bray"><organization/></author>
2671    <author initials="J." surname="Paoli" fullname="Jean Paoli"><organization/></author>
2672    <author initials="C.M." surname="Sperberg-McQueen" fullname="C. M. Sperberg-McQueen">
2673      <organization/></author>
2674    <author initials="E." surname="Maler" fullname="Eve Maler"><organization/></author>
2675    <author initials="F." surname="Yergeau" fullname="Francois Yergeau"><organization/></author>
2676    <date day="16" month="August" year="2006"/>
2677  </front>
2678  <seriesInfo name="World Wide Web Consortium" value="Recommendation"/>
2679</reference>
2680
2681<reference anchor="XMLNamespace" target="http://www.w3.org/TR/REC-xml-names">
2682  <front>
2683    <title>Namespaces in XML (Second Edition)</title>
2684    <author initials="T." surname="Bray" fullname="Tim Bray"><organization/></author>
2685    <author initials="D." surname="Hollander" fullname="Dave Hollander"><organization/></author>
2686    <author initials="A." surname="Layman" fullname="Andrew Layman"><organization/></author>
2687    <author initials="R." surname="Tobin" fullname="Richard Tobin"><organization/></author>
2688    <date day="16" month="August" year="2006"/>
2689  </front>
2690  <seriesInfo name="World Wide Web Consortium" value="Recommendation"/>
2691</reference>
2692
2693<reference anchor="XMLSchema" target="http://www.w3.org/TR/xmlschema-2/#anyURI">
2694<front>
2695<title>XML Schema Part 2: Datatypes</title>
2696<author initials="P." surname="Biron" fullname="Paul Biron"><organization/></author>
2697<author initials="A." surname="Malhotra" fullname="Ashok Malhotra"><organization/></author>
2698<date year="2001" month="May" day="2"/>
2699</front>
2700<seriesInfo name="World Wide Web Consortium" value="Recommendation"/>
2701</reference>
2702
2703<reference anchor="XPointer" target="http://www.w3.org/TR/xptr-framework/#escaping">
2704<front>
2705<title>XPointer Framework</title>
2706<author initials="P." surname="Grosso" fullname="Paul Grosso"><organization/></author>
2707<author initials="E." surname="Maler" fullname="Eve Maler"><organization/></author>
2708<author initials="J." surname="Marsh" fullname="Jonathan Marsh"><organization/></author>
2709<author initials="N." surname="Walsh" fullname="Norman Walsh"><organization/></author>
2710<date year="2003" month="March" day="25"/>
2711</front>
2712<seriesInfo name="World Wide Web Consortium" value="Recommendation"/>
2713</reference>
2714
2715<reference anchor="HTML5" target="http://www.w3.org/TR/2009/WD-html5-20090423/">
2716<front>
2717<title>A vocabulary and associated APIs for HTML and XHTML</title>
2718<author initials="I." surname="Hickson" fullname="Ian Hickson"><organization>Google, Inc.</organization></author>
2719<author initials="D." surname="Hyatt" fullname="David Hyatt"><organization>Apple, Inc.</organization></author>
2720<date year="2009"  month="April" day="23"/>
2721</front>
2722<seriesInfo name="World Wide Web Consortium" value="Working Draft"/>
2723</reference>
2724
2725</references>
2726
2727</back>
2728</rfc>
Note: See TracBrowser for help on using the repository browser.