src/viewer/ascii.c

src/viewer/ascii.c
Manual pages: mc • mcdiff • mcedit • mcview
/* */
This source file includes following definitions.
mcview_wcwidth
mcview_ismark
mcview_is_non_spacing_mark
mcview_is_spacing_mark
mcview_isprint
mcview_char_display
mcview_get_next_char
mcview_get_next_maybe_nroff_char
mcview_next_combining_char_sequence
mcview_display_line
mcview_display_paragraph
mcview_wrap_fixup
mcview_display_text
mcview_ascii_move_down
mcview_ascii_move_up
mcview_ascii_moveto_bol
mcview_ascii_moveto_eol
mcview_state_machine_init
   1 /*
   2    Internal file viewer for the Midnight Commander
   3    Function for plain view
   4 
   5    Copyright (C) 1994-2025
   6    Free Software Foundation, Inc.
   7 
   8    Written by:
   9    Miguel de Icaza, 1994, 1995, 1998
  10    Janne Kukonlehto, 1994, 1995
  11    Jakub Jelinek, 1995
  12    Joseph M. Hinkle, 1996
  13    Norbert Warmuth, 1997
  14    Pavel Machek, 1998
  15    Roland Illig <roland.illig@gmx.de>, 2004, 2005
  16    Slava Zanko <slavazanko@google.com>, 2009
  17    Andrew Borodin <aborodin@vmail.ru>, 2009-2022
  18    Ilia Maslakov <il.smind@gmail.com>, 2009
  19    Rewritten almost from scratch by:
  20    Egmont Koblinger <egmont@gmail.com>, 2014
  21 
  22    This file is part of the Midnight Commander.
  23 
  24    The Midnight Commander is free software: you can redistribute it
  25    and/or modify it under the terms of the GNU General Public License as
  26    published by the Free Software Foundation, either version 3 of the License,
  27    or (at your option) any later version.
  28 
  29    The Midnight Commander is distributed in the hope that it will be useful,
  30    but WITHOUT ANY WARRANTY; without even the implied warranty of
  31    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  32    GNU General Public License for more details.
  33 
  34    You should have received a copy of the GNU General Public License
  35    along with this program.  If not, see <https://www.gnu.org/licenses/>.
  36 
  37    ------------------------------------------------------------------------------------------------
  38 
  39    The viewer is implemented along the following design principles:
  40 
  41    Goals: Always display simple scripts, double wide (CJK), combining accents and spacing marks
  42    (often used e.g. in Devanagari) perfectly. Make the arrow keys always work correctly.
  43 
  44    Absolutely non-goal: RTL.
  45 
  46    Terminology:
  47 
  48    - A "paragraph" is the text between two adjacent newline characters. A "line" or "row" is a
  49    visual row on the screen. In wrap mode, the viewer formats a paragraph into one or more lines.
  50 
  51    - The Unicode glossary <https://www.unicode.org/glossary/> doesn't seem to have a notion of "base
  52    character followed by zero or more combining characters". The closest matches are "Combining
  53    Character Sequence" meaning a base character followed by one or more combining characters, or
  54    "Grapheme" which seems to exclude non-printable characters such as newline. In this file,
  55    "combining character sequence" (or any obvious abbreviation thereof) means a base character
  56    followed by zero or more (up to a current limit of 4) combining characters.
  57 
  58    ------------------------------------------------------------------------------------------------
  59 
  60    The parser-formatter is designed to be stateless across paragraphs. This is so that we can walk
  61    backwards without having to reparse the whole file (although we still need to reparse and
  62    reformat the whole paragraph, but it's a lot better). This principle needs to be changed if we
  63    ever get to address tickets 1849/2977, but then we can still store (for efficiency) the parser
  64    state at the beginning of the paragraph, and safely walk backwards if we don't cross an escape
  65    character.
  66 
  67    The parser-formatter, however, definitely needs to carry a state across lines. Currently this
  68    state contains:
  69 
  70    - The logical column (as if we didn't wrap). This is used for handling TAB characters after a
  71    wordwrap consistently with less.
  72 
  73    - Whether the last nroff character was bold or underlined. This is used for displaying the
  74    ambiguous _\b_ sequence consistently with less.
  75 
  76    - Whether the desired way of displaying a lonely combining accent or spacing mark is to place it
   77    over a dotted circle (we do this at the beginning of the paragraph or after a TAB), or to ignore
  78    the combining char and show replacement char for the spacing mark (we do this if e.g. too many
  79    of these were encountered and hence we don't glue them with their base character).
  80 
  81    - (This state needs to be expanded if e.g. we decide to print verbose replacement characters
  82    (e.g. "<U+0080>") and allow these to wrap around lines.)
  83 
  84    The state also contains the file offset, as it doesn't make sense to ever know the state without
  85    knowing the corresponding offset.
  86 
  87    The state depends on various settings (viewer width, encoding, nroff mode, charwrap or wordwrap
  88    mode (if we'll have that one day) etc.), needs to be recomputed if any of these changes.
  89 
  90    Walking forwards is usually relatively easy both in the file and on the screen. Walking
  91    backwards within a paragraph would only be possible in some special cases and even then it would
  92    be painful, so we always walk back to the beginning of the paragraph and reparse-reformat from
  93    there.
  94 
  95    (Walking back within a line in the file would have at least the following difficulties: handling
  96    the parser state; processing invalid UTF-8; processing invalid nroff (e.g. what is "_\bA\bA"?).
  97    Walking back on the display: we wouldn't know where to display the last line of a paragraph, or
  98    where to display a line if its following line starts with a wide (CJK or Tab) character. Long
  99    story short: just forget this approach.)
 100 
 101    Most important variables:
 102 
 103    - dpy_start: Both in unwrap and wrap modes this points to the beginning of the topmost displayed
 104    paragraph.
 105 
 106    - dpy_text_column: Only in unwrap mode, an additional horizontal scroll.
 107 
 108    - dpy_paragraph_skip_lines: Only in wrap mode, an additional vertical scroll (the number of
 109    lines that are scrolled off at the top from the topmost paragraph).
 110 
 111    - dpy_state_top: Only in wrap mode, the offset and parser-formatter state at the line where
 112    displaying the file begins is cached here.
 113 
 114    - dpy_wrap_dirty: If some parameter has changed that makes it necessary to reparse-redisplay the
 115    topmost paragraph.
 116 
 117    In wrap mode, the three variables "dpy_start", "dpy_paragraph_skip_lines" and "dpy_state_top"
 118    are kept consistent. Think of the first two as the ones describing the position, and the third
 119    as a cached value for better performance so that we don't need to wrap the invisible beginning
 120    of the topmost paragraph over and over again. The third value needs to be recomputed each time a
 121    parameter that influences parsing or displaying the file (e.g. width of screen, encoding, nroff
 122    mode) changes, this is signaled by "dpy_wrap_dirty" to force recomputing "dpy_state_top" (and
 123    clamp "dpy_paragraph_skip_lines" if necessary).
 124 
 125    ------------------------------------------------------------------------------------------------
 126 
 127    Help integration
 128 
 129    I'm planning to port the help viewer to this codebase.
 130 
 131    Splitting at sections would still happen in the help viewer. It would either copy a section, or
 132    set force_max and a similar force_min to limit displaying to one section only.
 133 
 134    Parsing the help format would go next to the nroff parser. The colors, alternate character set,
 135    and emitting the version number would go to the "state". (The version number would be
 136    implemented by emitting remaining characters of a buffer in the "state" one by one, without
 137    advancing in the file position.)
 138 
 139    The active link would be drawn similarly to the search highlight. Other than that, the viewer
 140    wouldn't care about links (except for their color). help.c would keep track of which one is
 141    highlighted, how to advance to the next/prev on an arrow, how the scroll offset needs to be
 142    adjusted when moving, etc.
 143 
 144    Add wrapping at word boundaries to where wrapping at char boundaries happens now.
 145  */
 146 
 147 #include <config.h>
 148 
 149 #include "lib/global.h"
 150 #include "lib/tty/tty.h"
 151 #include "lib/skin.h"
 152 #include "lib/util.h"  // is_printable()
 153 #include "lib/charsets.h"
 154 
 155 #include "src/setup.h"  // option_tab_spacing
 156 
 157 #include "internal.h"
 158 
 159 /*** global variables ****************************************************************************/
 160 
 161 /*** file scope macro definitions ****************************************************************/
 162 
 163 /* The Unicode standard recommends that lonely combining characters are printed over a dotted
 164  * circle. If the terminal is not UTF-8, this will be replaced by a dot anyway. */
 165 #define BASE_CHARACTER_FOR_LONELY_COMBINING 0x25CC  // dotted circle
 166 #define MAX_COMBINING_CHARS                 4       // both slang and ncurses support exactly 4
 167 
 168 /* I think anything other than space (e.g. arrows) just introduce visual clutter without actually
 169  * adding value. */
 170 #define PARTIAL_CJK_AT_LEFT_MARGIN  ' '
 171 #define PARTIAL_CJK_AT_RIGHT_MARGIN ' '
 172 
 173 /*
 174  * Wrap mode: This is for safety so that jumping to the end of file (which already includes
 175  * scrolling back by a page) and then walking backwards is reasonably fast, even if the file is
 176  * extremely large and consists of maybe full zeros or something like that. If there's no newline
 177  * found within this limit, just start displaying from there and see what happens. We might get
 178  * some displaying parameters (most importantly the columns) incorrect, but at least will show the
 179  * file without spinning the CPU for ages. When scrolling back to that point, the user might see a
 180  * garbled first line (even starting with an invalid partial UTF-8), but then walking back by yet
 181  * another line should fix it.
 182  *
 183  * Unwrap mode: This is not used, we wouldn't be able to do anything reasonable without walking
 184  * back a whole paragraph (well, view->data_area.height paragraphs actually).
 185  */
 186 #define MAX_BACKWARDS_WALK_IN_PARAGRAPH (100 * 1000)
 187 
 188 /*** file scope type declarations ****************************************************************/
 189 
 190 /*** forward declarations (file scope functions) *************************************************/
 191 
 192 /*** file scope variables ************************************************************************/
 193 
 194 /* --------------------------------------------------------------------------------------------- */
 195 /*** file scope functions ************************************************************************/
 196 /* --------------------------------------------------------------------------------------------- */
 197 
 198 /* TODO: These methods shouldn't be necessary, see ticket 3257 */
 199 
 200 static int
 201 mcview_wcwidth (const WView *view, int c)
     /*   */
 202 {
 203     if (view->utf8)
 204     {
 205         if (g_unichar_iswide (c))
 206             return 2;
 207         if (g_unichar_iszerowidth (c))
 208             return 0;
 209     }
 210 
 211     return 1;
 212 }
 213 
 214 /* --------------------------------------------------------------------------------------------- */
 215 
 216 static inline gboolean
 217 mcview_ismark (const WView *view, int c)
     /*   */
 218 {
 219     return (view->utf8 && g_unichar_ismark (c));
 220 }
 221 
 222 /* --------------------------------------------------------------------------------------------- */
 223 
 224 /* actually is_non_spacing_mark_or_enclosing_mark */
 225 static gboolean
 226 mcview_is_non_spacing_mark (const WView *view, int c)
     /*   */
 227 {
 228     if (view->utf8)
 229     {
 230         const GUnicodeType type = g_unichar_type (c);
 231 
 232         return type == G_UNICODE_NON_SPACING_MARK || type == G_UNICODE_ENCLOSING_MARK;
 233     }
 234 
 235     return FALSE;
 236 }
 237 
 238 /* --------------------------------------------------------------------------------------------- */
 239 
 240 #if 0
 241 static gboolean
 242 mcview_is_spacing_mark (const WView *view, int c)
     /*   */
 243 {
 244     return (view->utf8 && g_unichar_type (c) == G_UNICODE_SPACING_MARK);
 245 }
 246 #endif
 247 
 248 /* --------------------------------------------------------------------------------------------- */
 249 
 250 static gboolean
 251 mcview_isprint (const WView *view, int c)
     /*   */
 252 {
 253     if (!view->utf8)
 254         c = convert_from_8bit_to_utf_c ((unsigned char) c, view->converter);
 255     return g_unichar_isprint (c);
 256 }
 257 
 258 /* --------------------------------------------------------------------------------------------- */
 259 
 260 static int
 261 mcview_char_display (const WView *view, int c, char *s)
     /*   */
 262 {
 263     if (mc_global.utf8_display)
 264     {
 265         if (!view->utf8)
 266             c = convert_from_8bit_to_utf_c ((unsigned char) c, view->converter);
 267         if (!g_unichar_isprint (c))
 268             c = '.';
 269         return g_unichar_to_utf8 (c, s);
 270     }
 271     if (view->utf8)
 272     {
 273         if (g_unichar_iswide (c))
 274         {
 275             s[0] = s[1] = '.';
 276             return 2;
 277         }
 278         if (g_unichar_iszerowidth (c))
 279             return 0;
 280         // TODO the is_printable check below will be broken for this
 281         c = convert_from_utf_to_current_c (c, view->converter);
 282     }
 283     else
 284     {
 285         // TODO the is_printable check below will be broken for this
 286         c = convert_to_display_c (c);
 287     }
 288 
 289     // TODO this is very-very buggy by design: ticket 3257 comments 0-1
 290     if (!is_printable (c))
 291         c = '.';
 292     *s = c;
 293     return 1;
 294 }
 295 
 296 /* --------------------------------------------------------------------------------------------- */
 297 
 298 /**
 299  * Just for convenience, a common interface in front of mcview_get_utf and mcview_get_byte, so that
 300  * the caller doesn't have to care about utf8 vs 8-bit modes.
 301  *
 302  * Normally: stores c, updates state, returns TRUE.
 303  * At EOF: state is unchanged, c is undefined, returns FALSE.
 304  *
 305  * Just as with mcview_get_utf(), invalid UTF-8 is reported using negative integers.
 306  *
 307  * Also, temporary hack: handle force_max here.
 308  * TODO: move it to lower layers (datasource.c)?
 309  */
 310 static gboolean
 311 mcview_get_next_char (WView *view, mcview_state_machine_t *state, int *c)
     /*   */
 312 {
 313     // Pretend EOF if we reached force_max
 314     if (view->force_max >= 0 && state->offset >= view->force_max)
 315         return FALSE;
 316 
 317     if (view->utf8)
 318     {
 319         int char_length = 0;
 320 
 321         if (!mcview_get_utf (view, state->offset, c, &char_length))
 322             return FALSE;
 323         // Pretend EOF if we crossed force_max
 324         if (view->force_max >= 0 && state->offset + char_length > view->force_max)
 325             return FALSE;
 326 
 327         state->offset += char_length;
 328         return TRUE;
 329     }
 330 
 331     if (!mcview_get_byte (view, state->offset, c))
 332         return FALSE;
 333     state->offset++;
 334     return TRUE;
 335 }
 336 
 337 /* --------------------------------------------------------------------------------------------- */
 338 /**
 339  * This function parses the next nroff character and gives it to you along with its desired color,
 340  * so you never have to care about nroff again.
 341  *
 342  * The nroff mode does the backspace trick for every single character (Unicode codepoint). At least
 343  * that's what the GNU groff 1.22 package produces, and that's what less 458 expects. For
 344  * double-wide characters (CJK), still only a single backspace is emitted. For combining accents
 345  * and such, the print-backspace-print step is repeated for the base character and then for each
 346  * accent separately.
 347  *
 348  * So, the right place for this layer is after the bytes are interpreted in UTF-8, but before
 349  * joining a base character with its combining accents.
 350  *
 351  * Normally: stores c and color, updates state, returns TRUE.
 352  * At EOF: state is unchanged, c and color are undefined, returns FALSE.
 353  *
 354  * color can be null if the caller doesn't care.
 355  */
 356 static gboolean
 357 mcview_get_next_maybe_nroff_char (WView *view, mcview_state_machine_t *state, int *c, int *color)
     /*   */
 358 {
 359     mcview_state_machine_t state_after_nroff;
 360     int c2, c3;
 361 
 362     if (color != NULL)
 363         *color = VIEW_NORMAL_COLOR;
 364 
 365     if (!view->mode_flags.nroff)
 366         return mcview_get_next_char (view, state, c);
 367 
 368     if (!mcview_get_next_char (view, state, c))
 369         return FALSE;
 370     // Don't allow nroff formatting around CR, LF, TAB or other special chars
 371     if (!mcview_isprint (view, *c))
 372         return TRUE;
 373 
 374     state_after_nroff = *state;
 375 
 376     if (!mcview_get_next_char (view, &state_after_nroff, &c2))
 377         return TRUE;
 378     if (c2 != '\b')
 379         return TRUE;
 380 
 381     if (!mcview_get_next_char (view, &state_after_nroff, &c3))
 382         return TRUE;
 383     if (!mcview_isprint (view, c3))
 384         return TRUE;
 385 
 386     if (*c == '_' && c3 == '_')
 387     {
 388         *state = state_after_nroff;
 389         if (color != NULL)
 390             *color =
 391                 state->nroff_underscore_is_underlined ? VIEW_UNDERLINED_COLOR : VIEW_BOLD_COLOR;
 392     }
 393     else if (*c == c3)
 394     {
 395         *state = state_after_nroff;
 396         state->nroff_underscore_is_underlined = FALSE;
 397         if (color != NULL)
 398             *color = VIEW_BOLD_COLOR;
 399     }
 400     else if (*c == '_')
 401     {
 402         *c = c3;
 403         *state = state_after_nroff;
 404         state->nroff_underscore_is_underlined = TRUE;
 405         if (color != NULL)
 406             *color = VIEW_UNDERLINED_COLOR;
 407     }
 408 
 409     return TRUE;
 410 }
 411 
 412 /* --------------------------------------------------------------------------------------------- */
 413 /**
 414  * Get one base character, along with its combining or spacing mark characters.
 415  *
 416  * (A spacing mark is a character that extends the base character's width 1 into a combined
 417  * character of width 2, yet these two character cells should not be separated. E.g. Devanagari
 418  * <U+0939><U+094B>.)
 419  *
 420  * This method exists mainly for two reasons. One is to be able to tell if we fit on the current
 421  * line or need to wrap to the next one. The other is that both slang and ncurses seem to require
 422  * that the character and its combining marks are printed in a single call (or is it just a
 423  * limitation of mc's wrapper to them?).
 424  *
 425  * For convenience, this method takes care of converting CR or CR+LF into LF.
 426  * TODO this should probably happen later, when displaying the file?
 427  *
 428  * Normally: stores cs and color, updates state, returns >= 1 (entries in cs).
 429  * At EOF: state is unchanged, cs and color are undefined, returns 0.
 430  *
 431  * @param view ...
 432  * @param state the parser-formatter state machine's state, updated
 433  * @param cs store the characters here
 434  * @param clen the room available in cs (that is, at most clen-1 combining marks are allowed), must
 435  *   be at least 2
 436  * @param color if non-NULL, store the color here, taken from the first codepoint's color
 437  * @return the number of entries placed in cs, or 0 on EOF
 438  */
 439 static int
 440 mcview_next_combining_char_sequence (WView *view, mcview_state_machine_t *state, int *cs, int clen,
     /*   */
 441                                      int *color)
 442 {
 443     int i = 1;
 444 
 445     if (!mcview_get_next_maybe_nroff_char (view, state, cs, color))
 446         return 0;
 447 
 448     // Process \r and \r\n newlines.
 449     if (cs[0] == '\r')
 450     {
 451         int cnext;
 452 
 453         mcview_state_machine_t state_after_crlf = *state;
 454         if (mcview_get_next_maybe_nroff_char (view, &state_after_crlf, &cnext, NULL)
 455             && cnext == '\n')
 456             *state = state_after_crlf;
 457         cs[0] = '\n';
 458         return 1;
 459     }
 460 
 461     // We don't want combining over non-printable characters. This includes '\n' and '\t' too.
 462     if (!mcview_isprint (view, cs[0]))
 463         return 1;
 464 
 465     if (mcview_ismark (view, cs[0]))
 466     {
 467         if (!state->print_lonely_combining)
 468         {
 469             // First character is combining. Either just return it, ...
 470             return 1;
 471         }
 472         else
 473         {
 474             // or place this (and subsequent combining ones) over a dotted circle.
 475             cs[1] = cs[0];
 476             cs[0] = BASE_CHARACTER_FOR_LONELY_COMBINING;
 477             i = 2;
 478         }
 479     }
 480 
 481     if (mcview_wcwidth (view, cs[0]) == 2)
 482     {
 483         // Don't allow combining or spacing mark for wide characters, is this okay?
 484         return 1;
 485     }
 486 
 487     /* Look for more combining chars. Either at most clen-1 zero-width combining chars,
 488      * or at most 1 spacing mark. Is this logic correct? */
 489     for (; i < clen; i++)
 490     {
 491         mcview_state_machine_t state_after_combining;
 492 
 493         state_after_combining = *state;
 494         if (!mcview_get_next_maybe_nroff_char (view, &state_after_combining, &cs[i], NULL))
 495             return i;
 496         if (!mcview_ismark (view, cs[i]) || !mcview_isprint (view, cs[i]))
 497             return i;
 498         if (g_unichar_type (cs[i]) == G_UNICODE_SPACING_MARK)
 499         {
 500             // Only allow as the first combining char. Stop processing in either case.
 501             if (i == 1)
 502             {
 503                 *state = state_after_combining;
 504                 i++;
 505             }
 506             return i;
 507         }
 508         *state = state_after_combining;
 509     }
 510     return i;
 511 }
 512 
 513 /* --------------------------------------------------------------------------------------------- */
 514 /**
 515  * Parse, format and possibly display one visual line of text.
 516  *
 517  * Formatting starts at the given "state" (which encodes the file offset and parser and formatter's
 518  * internal state). In unwrap mode, this should point to the beginning of the paragraph with the
 519  * default state, the additional horizontal scrolling is added here. In wrap mode, this should
 520  * point to the beginning of the line, with the proper state at that point.
 521  *
 522  * In wrap mode, if a line ends in a newline, it is consumed, even if it's exactly at the right
 523  * edge. In unwrap mode, the whole remaining line, including the newline is consumed. Displaying
 524  * the next line should start at "state"'s new value, or if we displayed the bottom line then
 525  * state->offset tells the file offset to be shown in the top bar.
 526  *
 527  * If "row" is offscreen, don't actually display the line but still update "state" and return the
 528  * proper value. This is used by mcview_wrap_move_down to advance in the file.
 529  *
 530  * @param view ...
 531  * @param state the parser-formatter state machine's state, updated
 532  * @param row print to this row
 533  * @param paragraph_ended store TRUE if paragraph ended by newline or EOF, FALSE if wraps to next
 534  *   line
 535  * @param linewidth store the width of the line here
 536  * @return the number of rows, that is, 0 if we were already at EOF, otherwise 1
 537  */
 538 static int
 539 mcview_display_line (WView *view, mcview_state_machine_t *state, int row, gboolean *paragraph_ended,
     /*   */
 540                      off_t *linewidth)
 541 {
 542     const WRect *r = &view->data_area;
 543     off_t dpy_text_column = view->mode_flags.wrap ? 0 : view->dpy_text_column;
 544     int col = 0;
 545     int cs[1 + MAX_COMBINING_CHARS];
 546     char str[(1 + MAX_COMBINING_CHARS) * MB_LEN_MAX + 1];
 547     int i, j;
 548 
 549     if (paragraph_ended != NULL)
 550         *paragraph_ended = TRUE;
 551 
 552     if (!view->mode_flags.wrap && (row < 0 || row >= r->lines) && linewidth == NULL)
 553     {
 554         /* Optimization: Fast forward to the end of the line, rather than carefully
 555          * parsing and then not actually displaying it. */
 556         off_t eol;
 557         int retval;
 558 
 559         eol = mcview_eol (view, state->offset);
 560         retval = (eol > state->offset) ? 1 : 0;
 561 
 562         mcview_state_machine_init (state, eol);
 563         return retval;
 564     }
 565 
 566     while (TRUE)
 567     {
 568         int charwidth = 0;
 569         mcview_state_machine_t state_saved;
 570         int n;
 571         int color;
 572 
 573         state_saved = *state;
 574         n = mcview_next_combining_char_sequence (view, state, cs, 1 + MAX_COMBINING_CHARS, &color);
 575         if (n == 0)
 576         {
 577             if (linewidth != NULL)
 578                 *linewidth = col;
 579             return (col > 0) ? 1 : 0;
 580         }
 581 
 582         if (view->search_start <= state->offset && state->offset < view->search_end)
 583             color = VIEW_SELECTED_COLOR;
 584 
 585         if (cs[0] == '\n')
 586         {
 587             // New line: reset all formatting state for the next paragraph.
 588             mcview_state_machine_init (state, state->offset);
 589             if (linewidth != NULL)
 590                 *linewidth = col;
 591             return 1;
 592         }
 593 
 594         if (mcview_is_non_spacing_mark (view, cs[0]))
 595         {
 596             // Lonely combining character. Probably leftover after too many combining chars. Just
 597             // ignore.
 598             continue;
 599         }
 600 
 601         // Nonprintable, or lonely spacing mark
 602         if ((!mcview_isprint (view, cs[0]) || mcview_ismark (view, cs[0])) && cs[0] != '\t')
 603             cs[0] = '.';
 604 
 605         for (i = 0; i < n; i++)
 606             charwidth += mcview_wcwidth (view, cs[i]);
 607 
 608         /* Adjust the width for TAB. It's handled below along with the normal characters,
 609          * so that it's wrapped consistently with them, and is painted with the proper
 610          * attributes (although currently it can't have a special color). */
 611         if (cs[0] == '\t')
 612         {
 613             charwidth = option_tab_spacing - state->unwrapped_column % option_tab_spacing;
 614             state->print_lonely_combining = TRUE;
 615         }
 616         else
 617             state->print_lonely_combining = FALSE;
 618 
 619         /* In wrap mode only: We're done with this row if the character sequence wouldn't fit.
 620          * Except if at the first column, because then it wouldn't fit in the next row either.
 621          * In this extreme case let the unwrapped code below do its best to display it. */
 622         if (view->mode_flags.wrap && (off_t) col + charwidth > dpy_text_column + (off_t) r->cols
 623             && col > 0)
 624         {
 625             *state = state_saved;
 626             if (paragraph_ended != NULL)
 627                 *paragraph_ended = FALSE;
 628             if (linewidth != NULL)
 629                 *linewidth = col;
 630             return 1;
 631         }
 632 
 633         // Display, unless outside of the viewport.
 634         if (row >= 0 && row < r->lines)
 635         {
 636             if ((off_t) col >= dpy_text_column
 637                 && (off_t) col + charwidth <= dpy_text_column + (off_t) r->cols)
 638             {
 639                 // The combining character sequence fits entirely in the viewport. Print it.
 640                 tty_setcolor (color);
 641                 widget_gotoyx (view, r->y + row, r->x + ((off_t) col - dpy_text_column));
 642                 if (cs[0] == '\t')
 643                 {
 644                     for (i = 0; i < charwidth; i++)
 645                         tty_print_char (' ');
 646                 }
 647                 else
 648                 {
 649                     j = 0;
 650                     for (i = 0; i < n; i++)
 651                         j += mcview_char_display (view, cs[i], str + j);
 652                     str[j] = '\0';
 653                     /* This is probably a bug in our tty layer, but tty_print_string
 654                      * normalizes the string, whereas tty_printf doesn't. Don't normalize,
 655                      * since we handle combining characters ourselves correctly, it's
 656                      * better if they are copy-pasted correctly. Ticket 3255. */
 657                     tty_printf ("%s", str);
 658                 }
 659             }
 660             else if ((off_t) col < dpy_text_column && (off_t) col + charwidth > dpy_text_column)
 661             {
 662                 /* The combining character sequence would cross the left edge of the viewport.
 663                  * This cannot happen with wrap mode. Print replacement character(s),
 664                  * or spaces with the correct attributes for partial Tabs. */
 665                 tty_setcolor (color);
 666                 for (i = dpy_text_column;
 667                      i < (off_t) col + charwidth && i < dpy_text_column + (off_t) r->cols; i++)
 668                 {
 669                     widget_gotoyx (view, r->y + row, r->x + (i - dpy_text_column));
 670                     tty_print_anychar ((cs[0] == '\t') ? ' ' : PARTIAL_CJK_AT_LEFT_MARGIN);
 671                 }
 672             }
 673             else if ((off_t) col < dpy_text_column + (off_t) r->cols
 674                      && (off_t) col + charwidth > dpy_text_column + (off_t) r->cols)
 675             {
 676                 /* The combining character sequence would cross the right edge of the viewport
 677                  * and we're not wrapping. Print replacement character(s),
 678                  * or spaces with the correct attributes for partial Tabs. */
 679                 tty_setcolor (color);
 680                 for (i = col; i < dpy_text_column + (off_t) r->cols; i++)
 681                 {
 682                     widget_gotoyx (view, r->y + row, r->x + (i - dpy_text_column));
 683                     tty_print_anychar ((cs[0] == '\t') ? ' ' : PARTIAL_CJK_AT_RIGHT_MARGIN);
 684                 }
 685             }
 686         }
 687 
 688         col += charwidth;
 689         state->unwrapped_column += charwidth;
 690 
 691         if (!view->mode_flags.wrap && (off_t) col >= dpy_text_column + (off_t) r->cols
 692             && linewidth == NULL)
 693         {
 694             /* Optimization: Fast forward to the end of the line, rather than carefully
 695              * parsing and then not actually displaying it. */
 696             off_t eol;
 697 
 698             eol = mcview_eol (view, state->offset);
 699             mcview_state_machine_init (state, eol);
 700             return 1;
 701         }
 702     }
 703 }
 704 
 705 /* --------------------------------------------------------------------------------------------- */
 706 /**
 707  * Parse, format and possibly display one paragraph (perhaps not from the beginning).
 708  *
 709  * Formatting starts at the given "state" (which encodes the file offset and parser and formatter's
 710  * internal state). In unwrap mode, this should point to the beginning of the paragraph with the
 711  * default state, the additional horizontal scrolling is added here. In wrap mode, this may point
 712  * to the beginning of the line within a paragraph (to display the partial paragraph at the top),
 713  * with the proper state at that point.
 714  *
 715  * Displaying the next paragraph should start at "state"'s new value, or if we displayed the bottom
 716  * line then state->offset tells the file offset to be shown in the top bar.
 717  *
 718  * If "row" is negative, don't display the first abs(row) lines and display the rest from the top.
 719  * This was a nice idea but it's now unused :)
 720  *
 721  * If "row" is too large, don't display the paragraph at all but still return the number of lines.
 722  * This is used when moving upwards.
 723  *
 724  * @param view ...
 725  * @param state the parser-formatter state machine's state, updated
 726  * @param row print starting at this row
 727  * @return the number of rows the paragraphs is wrapped to, that is, 0 if we were already at EOF,
 728  *   otherwise 1 in unwrap mode, >= 1 in wrap mode. We stop when reaching the bottom of the
 729  *   viewport, it's not counted how many more lines the paragraph would occupy
 730  */
 731 static int
 732 mcview_display_paragraph (WView *view, mcview_state_machine_t *state, int row)
     /*   */
 733 {
 734     int lines = 0;
 735 
 736     while (TRUE)
 737     {
 738         gboolean paragraph_ended;
 739 
 740         lines += mcview_display_line (view, state, row, &paragraph_ended, NULL);
 741         if (paragraph_ended)
 742             return lines;
 743 
 744         if (row < view->data_area.lines)
 745         {
 746             row++;
 747             // stop if bottom of screen reached
 748             if (row >= view->data_area.lines)
 749                 return lines;
 750         }
 751     }
 752 }
 753 
 754 /* --------------------------------------------------------------------------------------------- */
 755 /**
 756  * Recompute dpy_state_top from dpy_start and dpy_paragraph_skip_lines. Clamp
 757  * dpy_paragraph_skip_lines if necessary.
 758  *
 759  * This method should be called in wrap mode after changing one of the parsing or formatting
 760  * properties (e.g. window width, encoding, nroff), or when switching to wrap mode from unwrap or
 761  * hex.
 762  *
 763  * If we stayed within the same paragraph then try to keep the vertical offset within that
 764  * paragraph as well. It might happen though that the paragraph became shorter than our desired
 765  * vertical position, in that case move to its last row.
 766  */
 767 static void
 768 mcview_wrap_fixup (WView *view)
     /*   */
 769 {
 770     int lines = view->dpy_paragraph_skip_lines;
 771 
 772     if (!view->dpy_wrap_dirty)
 773         return;
 774     view->dpy_wrap_dirty = FALSE;
 775 
 776     view->dpy_paragraph_skip_lines = 0;
 777     mcview_state_machine_init (&view->dpy_state_top, view->dpy_start);
 778 
 779     while (lines-- != 0)
 780     {
 781         mcview_state_machine_t state_prev;
 782         gboolean paragraph_ended;
 783 
 784         state_prev = view->dpy_state_top;
 785         if (mcview_display_line (view, &view->dpy_state_top, -1, &paragraph_ended, NULL) == 0)
 786             break;
 787         if (paragraph_ended)
 788         {
 789             view->dpy_state_top = state_prev;
 790             break;
 791         }
 792         view->dpy_paragraph_skip_lines++;
 793     }
 794 }
 795 
 796 /* --------------------------------------------------------------------------------------------- */
 797 /*** public functions ****************************************************************************/
 798 /* --------------------------------------------------------------------------------------------- */
 799 
 800 /**
 801  * In both wrap and unwrap modes, dpy_start points to the beginning of the paragraph.
 802  *
 803  * In unwrap mode, start displaying from this position, probably applying an additional horizontal
 804  * scroll.
 805  *
 806  * In wrap mode, an additional dpy_paragraph_skip_lines lines are skipped from the top of this
 807  * paragraph. dpy_state_top contains the position and parser-formatter state corresponding to the
 808  * top left corner so we can just start rendering from here. Unless dpy_wrap_dirty is set in which
 809  * case dpy_state_top is invalid and we need to recompute first.
 810  */
 811 void
 812 mcview_display_text (WView *view)
     /*   */
 813 {
 814     const WRect *r = &view->data_area;
 815     int row;
 816     mcview_state_machine_t state;
 817     gboolean again;
 818 
 819     do
 820     {
 821         int n;
 822 
 823         again = FALSE;
 824 
 825         mcview_display_clean (view);
 826         mcview_display_ruler (view);
 827 
 828         if (!view->mode_flags.wrap)
 829             mcview_state_machine_init (&state, view->dpy_start);
 830         else
 831         {
 832             mcview_wrap_fixup (view);
 833             state = view->dpy_state_top;
 834         }
 835 
 836         for (row = 0; row < r->lines; row += n)
 837         {
 838             n = mcview_display_paragraph (view, &state, row);
 839             if (n == 0)
 840             {
 841                 /* In the rare case that displaying didn't start at the beginning
 842                  * of the file, yet there are some empty lines at the bottom,
 843                  * scroll the file and display again. This happens when e.g. the
 844                  * window is made bigger, or the file becomes shorter due to
 845                  * charset change or enabling nroff. */
 846                 if ((view->mode_flags.wrap ? view->dpy_state_top.offset : view->dpy_start) > 0)
 847                 {
 848                     mcview_ascii_move_up (view, r->lines - row);
 849                     again = TRUE;
 850                 }
 851                 break;
 852             }
 853         }
 854     }
 855     while (again);
 856 
 857     view->dpy_end = state.offset;
 858     view->dpy_state_bottom = state;
 859 
 860     tty_setcolor (VIEW_NORMAL_COLOR);
 861     if (mcview_show_eof != NULL && mcview_show_eof[0] != '\0')
 862         while (row < r->lines)
 863         {
 864             widget_gotoyx (view, r->y + row, r->x);
 865             // TODO: should make it no wider than the viewport
 866             tty_print_string (mcview_show_eof);
 867             row++;
 868         }
 869 }
 870 
 871 /* --------------------------------------------------------------------------------------------- */
 872 /**
 873  * Move down.
 874  *
 875  * It's very simple. Just invisibly format the next "lines" lines, carefully carrying the formatter
 876  * state in wrap mode. But before each step we need to check if we've already hit the end of the
 877  * file, in that case we can no longer move. This is done by walking from dpy_state_bottom.
 878  *
 879  * Note that this relies on mcview_display_text() setting dpy_state_bottom to its correct value
 880  * upon rendering the screen contents. So don't call this function from other functions (e.g. at
 881  * the bottom of mcview_ascii_move_up()) which invalidate this value.
 882  */
 883 void
 884 mcview_ascii_move_down (WView *view, off_t lines)
     /*   */
 885 {
 886     while (lines-- != 0)
 887     {
 888         gboolean paragraph_ended;
 889 
 890         /* See if there's still data below the bottom line, by imaginarily displaying one
 891          * more line. This takes care of reading more data into growbuf, if required.
 892          * If the end position didn't advance, we're at EOF and hence bail out. */
 893         if (mcview_display_line (view, &view->dpy_state_bottom, -1, &paragraph_ended, NULL) == 0)
 894             break;
 895 
 896         /* Okay, there's enough data. Move by 1 row at the top, too. No need to check for
 897          * EOF, that can't happen. */
 898         if (!view->mode_flags.wrap)
 899         {
 900             view->dpy_start = mcview_eol (view, view->dpy_start);
 901             view->dpy_paragraph_skip_lines = 0;
 902             view->dpy_wrap_dirty = TRUE;
 903         }
 904         else
 905         {
 906             mcview_display_line (view, &view->dpy_state_top, -1, &paragraph_ended, NULL);
 907             if (!paragraph_ended)
 908                 view->dpy_paragraph_skip_lines++;
 909             else
 910             {
 911                 view->dpy_start = view->dpy_state_top.offset;
 912                 view->dpy_paragraph_skip_lines = 0;
 913             }
 914         }
 915     }
 916 }
 917 
 918 /* --------------------------------------------------------------------------------------------- */
 919 /**
 920  * Move up.
 921  *
 922  * Unwrap mode: Piece of cake. Wrap mode: If we'd walk back more than the current line offset
 923  * within the paragraph, we need to jump back to the previous paragraph and compute its height to
 924  * see if we start from that paragraph, and repeat this if necessary. Once we're within the desired
 925  * paragraph, we still need to format it from its beginning to know the state.
 926  *
 927  * See the top of this file for comments about MAX_BACKWARDS_WALK_IN_PARAGRAPH.
 928  *
 929  * force_max is a nice protection against the rare extreme case that the file underneath us
 930  * changes, we don't want to endlessly consume a file of maybe full of zeros upon moving upwards.
 931  */
 932 void
 933 mcview_ascii_move_up (WView *view, off_t lines)
     /*   */
 934 {
 935     if (!view->mode_flags.wrap)
 936     {
 937         while (lines-- != 0)
 938             view->dpy_start = mcview_bol (view, view->dpy_start - 1, 0);
 939         view->dpy_paragraph_skip_lines = 0;
 940         view->dpy_wrap_dirty = TRUE;
 941     }
 942     else
 943     {
 944         int i;
 945 
 946         while (lines > view->dpy_paragraph_skip_lines)
 947         {
 948             // We need to go back to the previous paragraph.
 949             if (view->dpy_start == 0)
 950             {
 951                 // Oops, we're already in the first paragraph.
 952                 view->dpy_paragraph_skip_lines = 0;
 953                 mcview_state_machine_init (&view->dpy_state_top, 0);
 954                 return;
 955             }
 956             lines -= view->dpy_paragraph_skip_lines;
 957             view->force_max = view->dpy_start;
 958             view->dpy_start = mcview_bol (view, view->dpy_start - 1,
 959                                           view->dpy_start - MAX_BACKWARDS_WALK_IN_PARAGRAPH);
 960             mcview_state_machine_init (&view->dpy_state_top, view->dpy_start);
 961             /* This is a tricky way of denoting that we're at the end of the paragraph.
 962              * Normally we'd jump to the next paragraph and reset paragraph_skip_lines. But for
 963              * walking backwards this is exactly what we need. */
 964             view->dpy_paragraph_skip_lines =
 965                 mcview_display_paragraph (view, &view->dpy_state_top, view->data_area.lines);
 966             view->force_max = -1;
 967         }
 968 
 969         /* Okay, we have have dpy_start pointing to the desired paragraph, and we still need to
 970          * walk back "lines" lines from the current "dpy_paragraph_skip_lines" offset. We can't do
 971          * that, so walk from the beginning of the paragraph. */
 972         mcview_state_machine_init (&view->dpy_state_top, view->dpy_start);
 973         view->dpy_paragraph_skip_lines -= lines;
 974         for (i = 0; i < view->dpy_paragraph_skip_lines; i++)
 975             mcview_display_line (view, &view->dpy_state_top, -1, NULL, NULL);
 976     }
 977 }
 978 
 979 /* --------------------------------------------------------------------------------------------- */
 980 
 981 void
 982 mcview_ascii_moveto_bol (WView *view)
     /*   */
 983 {
 984     if (!view->mode_flags.wrap)
 985         view->dpy_text_column = 0;
 986 }
 987 
 988 /* --------------------------------------------------------------------------------------------- */
 989 
 990 void
 991 mcview_ascii_moveto_eol (WView *view)
     /*   */
 992 {
 993     if (!view->mode_flags.wrap)
 994     {
 995         mcview_state_machine_t state;
 996         off_t linewidth;
 997 
 998         // Get the width of the topmost paragraph.
 999         mcview_state_machine_init (&state, view->dpy_start);
1000         mcview_display_line (view, &state, -1, NULL, &linewidth);
1001         view->dpy_text_column = DOZ (linewidth, (off_t) view->data_area.cols);
1002     }
1003 }
1004 
1005 /* --------------------------------------------------------------------------------------------- */
1006 
1007 void
1008 mcview_state_machine_init (mcview_state_machine_t *state, off_t offset)
     /*   */
1009 {
1010     memset (state, 0, sizeof (*state));
1011     state->offset = offset;
1012     state->print_lonely_combining = TRUE;
1013 }
1014 
1015 /* --------------------------------------------------------------------------------------------- */
/* */
root/src/viewer/ascii.c

DEFINITIONS